Repository: PaddlePaddle/PaddleFleetX Branch: develop Commit: 20f33ad21e9d Files: 507 Total size: 2.6 MB Directory structure: gitextract_it7z4sjw/ ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── benchmarks/ │ ├── README.md │ └── test_tipc/ │ ├── ernie/ │ │ └── dygraph/ │ │ └── hybrid_parallel/ │ │ ├── N1C1/ │ │ │ ├── ernie_bs16_fp16_DP1-MP1-PP1.sh │ │ │ └── ernie_bs16_fp32_DP1-MP1-PP1.sh │ │ ├── N1C8/ │ │ │ ├── ernie_bs16_fp16_DP2-MP2-PP2.sh │ │ │ └── ernie_bs16_fp32_DP2-MP2-PP2.sh │ │ ├── N4C32/ │ │ │ ├── ernie_bs16_fp16_DP1-MP8-PP4.sh │ │ │ ├── ernie_bs16_fp16_DP2-MP8-PP2.sh │ │ │ ├── ernie_bs16_fp16_DP4-MP8-PP1.sh │ │ │ ├── ernie_bs16_fp32_DP1-MP8-PP4.sh │ │ │ ├── ernie_bs16_fp32_DP2-MP8-PP2.sh │ │ │ └── ernie_bs16_fp32_DP4-MP8-PP1.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ ├── gpt/ │ │ ├── dygraph/ │ │ │ ├── data_parallel/ │ │ │ │ ├── N1C8/ │ │ │ │ │ ├── gpt_1024_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ │ ├── gpt_1024_flash_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ │ └── gpt_2048_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ ├── finetune/ │ │ │ │ ├── N1C1/ │ │ │ │ │ ├── CE_gpt_finetune_CoLA_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_MRPC_acc_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_MRPC_f1_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_QNLI_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_RTE_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_SST2_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_STSB_pearson_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_STSB_spearman_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ └── CE_gpt_finetune_WNLI_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ ├── hybrid_parallel/ │ │ │ │ ├── N1C1/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP1-PP1.sh │ │ │ │ │ └── gpt_bs16_fp32_DP1-MP1-PP1.sh │ │ │ │ ├── N1C4/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP1-PP4.sh │ │ │ │ │ └── gpt_bs16_fp16_DP1-MP4-PP1.sh │ │ │ │ ├── N1C8/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP1-PP8.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP2-PP4.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP4-PP2.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP8-PP1.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP2-MP2-PP2.sh │ │ │ │ │ ├── gpt_bs16_fp32_DP2-MP2-PP2.sh │ │ │ │ │ ├── gpt_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ │ ├── gpt_bs64_fp32_DP8-MP1-PP1.sh │ │ │ │ │ ├── gpt_recompute_bs16_fp16_DP2-MP2-PP2.sh │ │ │ │ │ └── gpt_recompute_bs16_fp32_DP2-MP2-PP2.sh │ │ │ │ ├── N4C32/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP8-PP4.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP2-MP8-PP2.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP4-MP8-PP1.sh │ │ │ │ │ ├── gpt_bs16_fp32_DP1-MP8-PP4.sh │ │ │ │ │ ├── gpt_bs16_fp32_DP2-MP8-PP2.sh │ │ │ │ │ └── gpt_bs16_fp32_DP4-MP8-PP1.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ ├── sequence_parallel/ │ │ │ │ ├── N1C8/ │ │ │ │ │ ├── gpt_sp_False_bs8_fp16_DP1-MP8-PP1.sh │ │ │ │ │ └── gpt_sp_True_bs8_fp16_DP1-MP8-PP1.sh │ │ │ │ ├── N4C32/ │ │ │ │ │ ├── gpt_sp_False_bs16_fp16_DP2-MP8-PP2.sh │ │ │ │ │ └── gpt_sp_True_bs16_fp16_DP2-MP8-PP2.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ └── sharding/ │ │ │ ├── N1C2/ │ │ │ │ ├── gpt_stage2_bs16_fp16_DP1-MP1-PP1-Sharding2.sh │ │ │ │ ├── gpt_stage3_bs16_fp16_DP1-MP1-PP1-Sharding2.sh │ │ │ │ └── gpt_stage3_bs16_fp32_DP1-MP1-PP1-Sharding2.sh │ │ │ ├── N2C16/ │ │ │ │ └── gpt_stage2_bs128_fp16_DP1-MP1-PP1-Sharding16.sh │ │ │ └── 
benchmark_common/ │ │ │ ├── prepare.sh │ │ │ └── run_benchmark.sh │ │ └── static/ │ │ └── auto_parallel/ │ │ ├── N1C1/ │ │ │ └── gpt_auto_recompute_bs8_fp32_DP1-MP1-PP1.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ ├── imagen/ │ │ └── dygraph/ │ │ ├── N1C1/ │ │ │ ├── imagen_397M_text2im_64_bs1_fp32_DP1-MP1-PP1.sh │ │ │ └── imagen_SR256_bs1_fp32_DP1-MP1-PP1.sh │ │ ├── N1C8/ │ │ │ ├── imagen_2B_text2im_64_bs8_fp32_DP1-Sharding8.sh │ │ │ ├── imagen_397M_text2im_64_bs8_fp32_DP8-MP1-PP1.sh │ │ │ ├── imagen_SR256_bs8_fp32_DP8-MP1-PP1.sh │ │ │ └── imagen_text2im_64_debertav2_bs8_fp32_DP8-MP1-PP1.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ └── vit/ │ └── dygraph/ │ ├── finetune/ │ │ ├── N1C8/ │ │ │ ├── ViT_large_patch16_384_ft_fused_False_bs512_fp16_DP.sh │ │ │ └── ViT_large_patch16_384_ft_fused_True_bs512_fp16_DP.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ └── pretrained/ │ ├── N2C16/ │ │ ├── ViT_large_patch16_224_pt_fused_False_bs128_fp16_DP.sh │ │ └── ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.sh │ └── benchmark_common/ │ ├── prepare.sh │ └── run_benchmark.sh ├── codestyle/ │ ├── .gitignore │ ├── clang_format.hook │ ├── copyright.hook │ ├── cpplint_pre_commit.hook │ ├── docstring_checker.py │ ├── pylint_pre_commit.hook │ └── test_docstring_checker.py ├── docs/ │ ├── cluster_deployment.md │ ├── compression.md │ ├── deployment_faq.md │ ├── docker_install.md │ ├── quick_start.md │ └── standard.md ├── examples/ │ └── transformer/ │ ├── __init__.py │ ├── models/ │ │ └── GPT/ │ │ ├── docs/ │ │ │ ├── README.md │ │ │ ├── hybrid_parallel.md │ │ │ ├── hybrid_profiler.md │ │ │ ├── inference.md │ │ │ ├── quantization_aware_training.md │ │ │ ├── single_card.md │ │ │ ├── single_finetune.md │ │ │ └── structured_pruning.md │ │ ├── finetune/ │ │ │ ├── configs/ │ │ │ │ ├── finetune_gpt_345M_single_card_glue.yaml │ │ │ │ └── finetune_gpt_base.yaml │ │ │ ├── impls.py │ │ │ ├── run.py │ │ │ └── run_task.sh │ │ ├── generation/ │ │ │ ├── configs/ │ │ │ │ ├── generation_gpt_345M_dp8.yaml │ │ │ │ ├── generation_gpt_345M_single_card.yaml │ │ │ │ ├── generation_gpt_base.yaml │ │ │ │ ├── generation_pruned_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_6.7B_single_card.yaml │ │ │ │ ├── inference_gpt_345M_dp8.yaml │ │ │ │ └── inference_gpt_345M_single_card.yaml │ │ │ ├── export.py │ │ │ ├── impls.py │ │ │ ├── inference.py │ │ │ └── run.py │ │ ├── offline-eval/ │ │ │ ├── configs/ │ │ │ │ ├── eval_gpt_345M_single_card.yaml │ │ │ │ ├── eval_gpt_base.yaml │ │ │ │ ├── eval_pruned_gpt_345M_single_card.yaml │ │ │ │ └── eval_qat_gpt_345M_single_card.yaml │ │ │ ├── impls.py │ │ │ └── run.py │ │ ├── pretrain/ │ │ │ ├── configs/ │ │ │ │ ├── export_qat_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_1.3B_dp8.yaml │ │ │ │ ├── pretrain_gpt_1.3B_single_card.yaml │ │ │ │ ├── pretrain_gpt_175B_mp8_pp16.yaml │ │ │ │ ├── pretrain_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_6.7B_sharding16.yaml │ │ │ │ ├── pretrain_gpt_base.yaml │ │ │ │ ├── pretrain_gpt_cn_345M_single_card.yaml │ │ │ │ ├── prune_gpt_345M_single_card.yaml │ │ │ │ ├── qat_gpt_345M_mp8.yaml │ │ │ │ ├── qat_gpt_345M_single_card.yaml │ │ │ │ └── qat_gpt_6.7B_sharding16.yaml │ │ │ ├── export.py │ │ │ ├── impls.py │ │ │ └── run.py │ │ └── pretrain_moe/ │ │ ├── configs/ │ │ │ ├── pretrain_moe_345M_single_card.yaml │ │ │ └── pretrain_moe_base.yaml │ │ ├── impls.py │ │ └── run.py │ └── utils/ │ ├── __init__.py │ ├── 
components.py │ ├── config.py │ └── qat.py ├── ppfleetx/ │ ├── __init__.py │ ├── configs/ │ │ ├── multimodal/ │ │ │ └── imagen/ │ │ │ ├── imagen_397M_text2im_64x64.yaml │ │ │ ├── imagen_base.yaml │ │ │ ├── imagen_super_resolution_1024.yaml │ │ │ ├── imagen_super_resolution_256.yaml │ │ │ ├── imagen_text2im_64x64_DebertaV2.yaml │ │ │ └── imagen_text2im_64x64_T5-11B.yaml │ │ ├── nlp/ │ │ │ ├── ernie/ │ │ │ │ ├── auto/ │ │ │ │ │ ├── finetune_ernie_345M_single_card.yaml │ │ │ │ │ ├── finetune_ernie_base.yaml │ │ │ │ │ ├── pretrain_ernie_base.yaml │ │ │ │ │ └── pretrain_ernie_base_345M_single_card.yaml │ │ │ │ ├── finetune_ernie_345M_single_card.yaml │ │ │ │ ├── finetune_ernie_base.yaml │ │ │ │ ├── inference_ernie_345M_single_card.yaml │ │ │ │ ├── pretrain_ernie_base.yaml │ │ │ │ ├── pretrain_ernie_base_175B_mp8_pp16.yaml │ │ │ │ ├── pretrain_ernie_base_345M_single_card.yaml │ │ │ │ ├── pretrain_ernie_base_3D.yaml │ │ │ │ ├── pretrain_ernie_base_6.7B_sharding16.yaml │ │ │ │ ├── pretrain_ernie_large_single_card.yaml │ │ │ │ └── qat_ernie_base.yaml │ │ │ ├── gpt/ │ │ │ │ ├── auto/ │ │ │ │ │ ├── export_gpt_fp16_single_card.yaml │ │ │ │ │ ├── generation_gpt_175B_mp8.yaml │ │ │ │ │ ├── generation_gpt_345M_mp2.yaml │ │ │ │ │ ├── generation_gpt_345M_single_card.yaml │ │ │ │ │ ├── generation_gpt_6.7B_mp1.yaml │ │ │ │ │ ├── pretrain_gpt_1.3B_dp8.yaml │ │ │ │ │ ├── pretrain_gpt_1.3B_dp8_tuning.yaml │ │ │ │ │ ├── pretrain_gpt_1.3B_single_card.yaml │ │ │ │ │ ├── pretrain_gpt_345M_single_card.yaml │ │ │ │ │ ├── pretrain_gpt_6.7B_sharding16.yaml │ │ │ │ │ ├── pretrain_gpt_base.yaml │ │ │ │ │ └── qat_generation_gpt_345M_mp2.yaml │ │ │ │ ├── eval_gpt_345M_single_card.yaml │ │ │ │ ├── eval_pruned_gpt_345M_single_card.yaml │ │ │ │ ├── eval_qat_gpt_345M_single_card.yaml │ │ │ │ ├── export_qat_gpt_345M_single_card.yaml │ │ │ │ ├── finetune_gpt_345M_single_card_glue.yaml │ │ │ │ ├── finetune_gpt_base.yaml │ │ │ │ ├── generation_gpt_345M_dp8.yaml │ │ │ │ ├── generation_gpt_345M_mp1.yaml │ │ │ │ ├── generation_gpt_345M_single_card.yaml │ │ │ │ ├── generation_gpt_6.7B_single_mp1.yaml │ │ │ │ ├── generation_pruned_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_6.7B_single_card.yaml │ │ │ │ ├── inference_gpt_345M_dp8.yaml │ │ │ │ ├── inference_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_1.3B_dp8.yaml │ │ │ │ ├── pretrain_gpt_1.3B_single_card.yaml │ │ │ │ ├── pretrain_gpt_13B_dp8.yaml │ │ │ │ ├── pretrain_gpt_175B_mp8_pp16.yaml │ │ │ │ ├── pretrain_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_6.7B_sharding16.yaml │ │ │ │ ├── pretrain_gpt_6.7B_single_card.yaml │ │ │ │ ├── pretrain_gpt_base.yaml │ │ │ │ ├── pretrain_gpt_cn_345M_single_card.yaml │ │ │ │ ├── prune_gpt_345M_single_card.yaml │ │ │ │ ├── qat_gpt_345M_mp8.yaml │ │ │ │ ├── qat_gpt_345M_single_card.yaml │ │ │ │ └── qat_gpt_6.7B_sharding16.yaml │ │ │ └── moe/ │ │ │ ├── pretrain_moe_1.3B_dp8.yaml │ │ │ └── pretrain_moe_base.yaml │ │ └── vis/ │ │ ├── base.yaml │ │ ├── moco/ │ │ │ ├── moco_lincls_in1k_1n8c.yaml │ │ │ ├── mocov1_pt_in1k_1n8c.yaml │ │ │ └── mocov2_pt_in1k_1n8c.yaml │ │ └── vit/ │ │ ├── ViT_base_patch16_224_inference.yaml │ │ ├── ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml │ │ ├── ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml 
│ │ ├── ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml │ │ └── auto/ │ │ ├── ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml │ │ └── base.yaml │ ├── core/ │ │ ├── __init__.py │ │ ├── engine/ │ │ │ ├── __init__.py │ │ │ ├── auto_engine.py │ │ │ ├── basic_engine.py │ │ │ ├── eager_engine.py │ │ │ └── inference_engine.py │ │ └── module/ │ │ ├── __init__.py │ │ └── basic_module.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_tools/ │ │ │ ├── __init__.py │ │ │ ├── cpp/ │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── compile.py │ │ │ │ └── fast_index_map_helpers.cpp │ │ │ ├── ernie/ │ │ │ │ ├── __init__.py │ │ │ │ └── preprocess/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── create_pretraining_data.py │ │ │ │ ├── docs/ │ │ │ │ │ ├── CLUECorpus2020.md │ │ │ │ │ ├── CLUECorpusSmall.md │ │ │ │ │ ├── OpenWebText2.md │ │ │ │ │ └── WuDaoCorpusBase.md │ │ │ │ ├── trans_to_json.py │ │ │ │ └── words_segmentation.py │ │ │ └── gpt/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── preprocess_data.py │ │ │ └── raw_trans_to_json.py │ │ ├── dataset/ │ │ │ ├── __init__.py │ │ │ ├── ernie/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset_utils.py │ │ │ │ └── ernie_dataset.py │ │ │ ├── glue_dataset.py │ │ │ ├── gpt_dataset.py │ │ │ ├── multimodal_dataset.py │ │ │ └── vision_dataset.py │ │ ├── sampler/ │ │ │ ├── __init__.py │ │ │ ├── batch_sampler.py │ │ │ └── collate.py │ │ ├── tokenizers/ │ │ │ ├── __init__.py │ │ │ ├── debertav2_tokenizer.py │ │ │ ├── ernie_tokenizer.py │ │ │ ├── gpt_tokenizer.py │ │ │ ├── t5_tokenization_utils.py │ │ │ ├── t5_tokenizer.py │ │ │ └── tokenization_utils_base.py │ │ ├── transforms/ │ │ │ ├── __init__.py │ │ │ ├── preprocess.py │ │ │ └── utils.py │ │ └── utils/ │ │ ├── __init__.py │ │ └── batch_collate_fn.py │ ├── distributed/ │ │ ├── __init__.py │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── amp.py │ │ │ ├── comm_groups.py │ │ │ ├── env.py │ │ │ ├── io.py │ │ │ └── strategy.py │ │ └── protein_folding/ │ │ ├── __init__.py │ │ ├── bp.py │ │ ├── dap.py │ │ ├── dp.py │ │ └── scg.py │ ├── models/ │ │ ├── __init__.py │ │ ├── language_model/ │ │ │ ├── __init__.py │ │ │ ├── auto_utils.py │ │ │ ├── debertav2/ │ │ │ │ ├── __init__.py │ │ │ │ └── modeling.py │ │ │ ├── ernie/ │ │ │ │ ├── __init__.py │ │ │ │ ├── auto/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── auto_model.py │ │ │ │ │ ├── auto_module.py │ │ │ │ │ └── auto_transformer.py │ │ │ │ ├── dygraph/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── hybrid_model.py │ │ │ │ │ └── single_model.py │ │ │ │ ├── ernie_module.py │ │ │ │ ├── finetune_configs.yaml │ │ │ │ └── layers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── distributed_transformer.py │ │ │ │ ├── model_outputs.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ ├── gpt/ │ │ │ │ ├── __init__.py │ │ │ │ ├── auto/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── auto_model.py │ │ │ │ │ └── auto_module.py │ │ │ │ └── dygraph/ │ │ │ │ ├── __init__.py │ │ │ │ ├── hybrid_model.py │ │ │ │ ├── processor.py │ │ │ │ ├── sequence_parallel_utils.py │ │ │ │ └── single_model.py │ │ │ ├── language_module.py │ │ │ ├── metrics.py │ │ │ ├── moe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── comm/ │ │ │ │ │ └── __init__.py │ │ │ │ ├── comm_ops.py │ │ │ │ ├── gate/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_gate.py │ │ │ │ │ ├── gshard_gate.py │ │ │ │ │ ├── naive_gate.py │ │ │ │ │ └── switch_gate.py │ │ │ │ ├── moe_layer.py │ │ │ │ └── utils.py │ │ │ ├── moe_exp/ │ │ │ │ ├── __init__.py │ │ │ │ ├── experts.py │ │ │ │ ├── layer.py │ │ │ │ ├── mappings.py │ │ │ │ └── sharded_moe.py │ │ │ ├── t5/ │ │ │ │ 
├── __init__.py │ │ │ │ ├── modeling.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── multimodal_model/ │ │ │ ├── __init__.py │ │ │ ├── clip/ │ │ │ │ └── __init__.py │ │ │ ├── imagen/ │ │ │ │ ├── __init__.py │ │ │ │ ├── modeling.py │ │ │ │ ├── unet.py │ │ │ │ └── utils.py │ │ │ ├── multimodal_module.py │ │ │ └── utils.py │ │ ├── protein_folding/ │ │ │ ├── __init__.py │ │ │ ├── all_atom.py │ │ │ ├── attentions.py │ │ │ ├── common.py │ │ │ ├── evoformer.py │ │ │ ├── outer_product_mean.py │ │ │ ├── quat_affine.py │ │ │ ├── r3.py │ │ │ ├── residue_constants.py │ │ │ └── template.py │ │ └── vision_model/ │ │ ├── __init__.py │ │ ├── factory.py │ │ ├── general_classification_module.py │ │ ├── layers/ │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── droppath.py │ │ │ ├── embedding.py │ │ │ ├── identity.py │ │ │ ├── initializer.py │ │ │ └── mlp.py │ │ ├── loss/ │ │ │ ├── __init__.py │ │ │ └── cross_entropy.py │ │ ├── metrics/ │ │ │ ├── __init__.py │ │ │ └── accuracy.py │ │ ├── moco/ │ │ │ ├── __init__.py │ │ │ └── moco.py │ │ ├── moco_module.py │ │ ├── resnet/ │ │ │ └── __init__.py │ │ └── vit/ │ │ ├── __init__.py │ │ └── vit.py │ ├── ops/ │ │ ├── setup_cuda.py │ │ ├── test_topp_sampling.py │ │ └── topp_sampling.cu │ ├── optims/ │ │ ├── __init__.py │ │ ├── grad_clip.py │ │ ├── lr_scheduler.py │ │ └── optimizer.py │ ├── tools/ │ │ ├── __init__.py │ │ └── multiprocess_tool.py │ └── utils/ │ ├── __init__.py │ ├── check.py │ ├── compression_helper.py │ ├── config.py │ ├── device.py │ ├── download.py │ ├── export.py │ ├── file.py │ ├── log.py │ ├── tensor_fusion_helper.py │ └── version.py ├── projects/ │ ├── ernie/ │ │ ├── auto_export_ernie_345M_mp1.sh │ │ ├── auto_export_ernie_345M_mp2.sh │ │ ├── auto_export_ernie_345M_mp2_npu.sh │ │ ├── auto_export_ernie_345M_mp2_xpu.sh │ │ ├── docs/ │ │ │ ├── README.md │ │ │ └── inference.md │ │ ├── export_ernie_345M_single_card.sh │ │ ├── finetune_ernie_345M_single_card.sh │ │ ├── finetune_ernie_345M_single_card_npu.sh │ │ ├── inference.py │ │ ├── pretrain_ernie_base.sh │ │ ├── pretrain_ernie_base_175B_mp8_pp16.sh │ │ ├── pretrain_ernie_base_3D.sh │ │ ├── pretrain_ernie_base_3D_npu.sh │ │ ├── pretrain_ernie_base_6.7B_sharding16.sh │ │ ├── pretrain_ernie_large.sh │ │ ├── pretrain_ernie_large_mp2_mlu.sh │ │ ├── pretrain_ernie_large_mp2_npu.sh │ │ ├── pretrain_ernie_large_mp2_pp2_npu.sh │ │ ├── pretrain_ernie_large_npu.sh │ │ ├── run_inference.sh │ │ ├── run_inference_mp2.sh │ │ ├── run_inference_mp2_npu.sh │ │ └── run_inference_mp2_xpu.sh │ ├── gpt/ │ │ ├── auto_export_gpt_175B_mp8.sh │ │ ├── auto_export_gpt_345M_mp2.sh │ │ ├── auto_export_gpt_345M_single_card.sh │ │ ├── auto_export_gpt_6.7B_mp1.sh │ │ ├── auto_export_gpt_fp16_single_card.sh │ │ ├── auto_gpt_1.3B_dp8.sh │ │ ├── auto_gpt_1.3B_dp8_tuning.sh │ │ ├── auto_gpt_1.3B_single_card.sh │ │ ├── auto_gpt_345M_single_card.sh │ │ ├── auto_gpt_6.7B_sharding16.sh │ │ ├── auto_qat_export_gpt_345M_mp2.sh │ │ ├── benchmark.py │ │ ├── docs/ │ │ │ ├── README.md │ │ │ ├── auto_parallel.md │ │ │ ├── hybrid_parallel.md │ │ │ ├── hybrid_profiler.md │ │ │ ├── inference.md │ │ │ ├── quantization_aware_training.md │ │ │ ├── single_card.md │ │ │ ├── single_finetune.md │ │ │ └── structured_pruning.md │ │ ├── eval_prune_gpt_345M_single_card.sh │ │ ├── eval_qat_gpt_345M_single_card.sh │ │ ├── evaluate_gpt_345M_single_card.sh │ │ ├── export_gpt_345M_single_card.sh │ │ ├── export_prune_gpt_345M_single_card.sh │ │ ├── export_qat_gpt_345M_single_card.sh │ │ ├── finetune_gpt_345M_single_card.sh │ │ ├── inference.py │ │ ├── 
inference_gpt_6.7B_single_card.sh │ │ ├── inference_gpt_multigpu.sh │ │ ├── inference_gpt_single_card.sh │ │ ├── pretrain_gpt_1.3B_dp8.sh │ │ ├── pretrain_gpt_1.3B_single_card.sh │ │ ├── pretrain_gpt_175B_mp8_pp16.sh │ │ ├── pretrain_gpt_345M_single_card.sh │ │ ├── pretrain_gpt_6.7B_sharding16.sh │ │ ├── prune_gpt_345M_single_card.sh │ │ ├── qat_gpt_345M_mp8.sh │ │ ├── qat_gpt_345M_single_card.sh │ │ ├── qat_gpt_6.7B_sharding16.sh │ │ └── run_benchmark.sh │ ├── imagen/ │ │ ├── README.md │ │ ├── filelist/ │ │ │ └── laion_400M/ │ │ │ └── train │ │ ├── run_super_resolution_1024_sharding128.sh │ │ ├── run_super_resolution_256_dp128.sh │ │ ├── run_super_resolution_256_single_card.sh │ │ ├── run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh │ │ ├── run_text2im_397M_64x64_dp128.sh │ │ ├── run_text2im_397M_64x64_single_card.sh │ │ └── run_text2im_64x64_DebertaV2_dp8.sh │ ├── moco/ │ │ ├── README.md │ │ ├── run_mocov1_lincls_in1k.sh │ │ ├── run_mocov1_pretrain_in1k.sh │ │ ├── run_mocov2_lincls_in1k.sh │ │ └── run_mocov2_pretrain_in1k.sh │ ├── protein_folding/ │ │ └── README.md │ ├── ufo2.0/ │ │ └── README.md │ └── vit/ │ ├── README.md │ ├── auto_vit_patch16_224_dp8.sh │ ├── docs/ │ │ └── inference.md │ ├── export_qat.sh │ ├── inference.py │ ├── run_finetune.sh │ ├── run_finetune_fused_attention.sh │ ├── run_inference_base_patch16_224.sh │ ├── run_pretrain.sh │ ├── run_pretrained_fused_attention.sh │ └── run_qat.sh ├── requirements.txt ├── setup.py ├── tasks/ │ └── gpt/ │ ├── generation.py │ ├── inference.py │ └── run_generation.sh └── tools/ ├── auto.py ├── auto_export.py ├── eval.py ├── export.py ├── inference.py └── train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .DS_Store .idea ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/Lucas-C/pre-commit-hooks.git sha: v1.0.1 hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - repo: https://github.com/PaddlePaddle/mirrors-yapf.git sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ - repo: https://github.com/pre-commit/pre-commit-hooks sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0 hooks: - id: check-added-large-files - id: check-merge-conflict - id: check-symlinks - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer - repo: local hooks: - id: clang-format-with-version-check name: clang-format description: Format files with ClangFormat. entry: bash ./codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: local hooks: - id: cpplint-cpp-source name: cpplint description: Check C++ code style using cpplint.py. entry: bash ./codestyle/cpplint_pre_commit.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ - repo: local hooks: - id: pylint-doc-string name: pylint description: Check python docstring style using docstring_checker. entry: bash ./codestyle/pylint_pre_commit.hook language: system files: \.(py)$ - repo: local hooks: - id: copyright_checker name: copyright_checker entry: python ./codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ ================================================ FILE: Dockerfile ================================================ ARG BASE_IMAGE=registry.baidubce.com/paddlepaddle/paddle:2.4.1-gpu-cuda11.2-cudnn8.2-trt8.0 FROM $BASE_IMAGE WORKDIR /paddle RUN python -m pip install paddlepaddle-gpu==0.0.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html # RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetx/develop/requirements.txt && python -m pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple COPY requirements.txt /paddle RUN python -m pip install -r requirements.txt #-i https://mirror.baidu.com/pypi/simple ENV LD_LIBRARY_PATH=/usr/lib64/:${LD_LIBRARY_PATH} ================================================ FILE: LICENSE ================================================ Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

------------------------------------------------------------------------------------------

## Introduction

PaddleFleetX is a large-model toolkit built on the PaddlePaddle deep learning framework. It aims to provide high-performance, flexible, and easy-to-use capabilities for the entire large-model workflow, with end-to-end optimization across six stages: **development**, **training**, **fine-tuning**, **compression**, **inference**, and **deployment**.

*Figure: the PaddlePaddle large model suite*

## Key Features

### Large model development: unified dynamic/static development mode with flexible 4D hybrid parallelism

*Figure: large model development*

Building on PaddlePaddle's unified dynamic/static graph development mode, the suite is developed entirely with dynamic graphs; operator fusion is applied automatically in the Generate API, combining the debuggability of dynamic graphs with static-graph-level performance. The unified, all-scenario Trainer makes it easy to configure 4D hybrid parallelism (see the launch sketch below) and can be used in both pretraining and fine-tuning.

### Large model training: unlocking base compute potential and comprehensively improving distributed efficiency

For large model training, PaddlePaddle optimizes the full pipeline of data loading, mixed-precision compute strategies, high-performance operator libraries, automatic parallel-strategy tuning, and pipeline scheduling, helping speed up 文心 (ERNIE) large model training by 3x.

*Figure: PaddlePaddle support for large model training*
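The snippet below is a minimal, hedged sketch of such a hybrid-parallel launch. It mirrors the override flags used by the benchmark scripts under `benchmarks/test_tipc/` in this repository; the config path and degree values are illustrative and should be adapted to your own hardware and data.

```bash
# Illustrative hybrid-parallel launch (DP2 x MP2 x PP2 on 8 GPUs), following the
# pattern used by benchmarks/test_tipc/*/benchmark_common/run_benchmark.sh.
# dp_degree * mp_degree * pp_degree should match the number of devices.
python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 \
    tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \
    -o Distributed.dp_degree=2 \
    -o Distributed.mp_degree=2 \
    -o Distributed.pp_degree=2 \
    -o Global.local_batch_size=8 \
    -o Global.micro_batch_size=2
```

The same `-o` override mechanism is used throughout the repository's scripts to adjust parallel degrees, batch sizes, and engine settings without editing the YAML configs.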

### Large model fine-tuning: mainstream fine-tuning algorithms with leading performance

PaddleFleetX provides the mainstream fine-tuning algorithms SFT, Prefix-Tuning, and LoRA, effectively lowering the resource barrier for large model training. The unified Trainer reuses the pretraining acceleration techniques in fine-tuning scenarios, and variable-length data flow optimizations substantially improve fine-tuning performance.

*Figure: large model fine-tuning*

### Large model compression: in-house quantization algorithms for lossless quantization

PaddlePaddle's in-house Shift-SmoothQuant algorithm produces a smoother activation distribution than SmoothQuant, effectively improving the accuracy of quantized models and the stability of their generated outputs. With PaddleSlim's large-model compression tools, we achieve lossless quantization of mainstream open-source large models on the C-Eval and NL2SQL datasets. See [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) for more technical details and usage instructions.

*Figure: model compression*

*Figure: model compression*
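Separately from the PaddleSlim post-training quantization flow described above, this repository also bundles quantization-aware training (QAT) configs, such as `ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml`. As a hedged sketch following the same `tools/train.py -c <yaml>` launch pattern used by the other scripts in this repo (flags and paths may need adjusting for your environment), QAT can be started with:

```bash
# Hypothetical single-card QAT launch using a config bundled with this repo;
# adjust devices, batch sizes, and data paths to your environment.
python -m paddle.distributed.launch --log_dir=./mylog --devices=0 \
    tools/train.py -c ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml
```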

### Large model inference: quantized inference schemes matched to workload characteristics

Because the prompt phase and the token generation phase of large-model inference have different computational characteristics, Paddle Inference provides static quantization for general scenarios, and mixed quantization with low-bit inference for memory-bandwidth-bound scenarios.

*Figure: PaddlePaddle support for large model inference*

*Figure: inference engine*

### Large model deployment: dynamic request insertion with real-time load awareness to maximize hardware utilization

In generation workloads the decoding phase is time-consuming and output lengths vary from query to query. To maximize serving throughput, the FastDeploy serving framework, combined with the inference engine, implements dynamic insertion: it monitors the serving load in real time and dynamically inserts incoming user requests to maximize inference hardware utilization.

*Figure: large model serving deployment*

## PaddleFleetX Application Cases

### Large language models

Building on PaddleFleetX's core capabilities, PaddleNLP provides rich end-to-end development and application examples for large language models; see [PaddleNLP LLM](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm) for detailed usage instructions.

### Cross-modal large models

Beyond large language models, PaddleFleetX also supports developing and training cross-modal large models, such as multimodal pretraining and text-to-image diffusion models, covering image, text, video, and audio modalities; see [PaddleMIX](https://github.com/PaddlePaddle/PaddleMIX) for detailed usage instructions.

### Biocomputing large models

In the biocomputing domain, building on PaddlePaddle's 4D parallel strategies and high-performance optimizations, PaddleHelix provides many industry-leading biocomputing pretrained models; see [PaddleHelix](https://github.com/PaddlePaddle/PaddleHelix) for detailed usage instructions.

## Citation

```
@misc{paddlefleetx,
  title={PaddleFleetX: An Easy-to-use and High-Performance One-stop Tool for Deep Learning},
  author={PaddleFleetX Contributors},
  howpublished = {\url{https://github.com/PaddlePaddle/PaddleFleetX}},
  year={2022}
}
```

## License

PaddleFleetX is released under the [Apache 2.0 license](./LICENSE).

================================================ FILE: benchmarks/README.md ================================================ ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C1/ernie_bs16_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 model=ernie micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C1/ernie_bs16_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
model_item=ernie dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 model=ernie micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C8/ernie_bs16_fp16_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP2-PP2 device_num=N1C8 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C8/ernie_bs16_fp32_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP2-PP2 device_num=N1C8 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP8-PP4 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP4-MP8-PP1 device_num=N4C32 model=ernie micro_bs=4 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp32 run_mode=DP1-MP8-PP4 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP8-PP2 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=ernie dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP4-MP8-PP1 device_num=N4C32 model=ernie micro_bs=4 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf dataset/ernie mkdir -p dataset/ernie wget -O dataset/ernie/cluecorpussmall_14g_1207_ids_part0 https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_ids_part0 wget -O dataset/ernie/cluecorpussmall_14g_1207_ids_part1 https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_ids_part1 cat dataset/ernie/cluecorpussmall_14g_1207_ids_part* &> dataset/ernie/cluecorpussmall_14g_1207_ids.npy wget -O dataset/ernie/cluecorpussmall_14g_1207_idx.npz https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_idx.npz ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="tokens/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" max_iter=${10:-500} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 use_sharding=${11:-"false"} # (可选) 是否使用Sharding num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${12:-"False"} # (可选)是否打开recompute sharding_stage=${13:-"1"} # (可选)sharding case sharding_offload=${14:-"False"} # (可选) eval_freq=${15:-"1000000"} # (可选) sharding_degree=${16:-"1"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` num_attention_heads=16 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #"gpt2-small-en" num_layers=24 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #"gpt2-small-en" use_pure_fp16=False # fp32 if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Global.seed=1234 \ -o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.hidden_size=1024 \ -o Model.num_hidden_layers=${num_layers} \ -o Model.num_attention_heads=${num_attention_heads} \ -o Model.use_recompute=${use_recompute} \ -o Data.Train.dataset.input_dir=./dataset/ernie \ -o 
Data.Eval.dataset.input_dir=./dataset/ernie \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ -o Optimizer.lr.max_lr=1e-4 \ -o Optimizer.lr.min_lr=1e-5 " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 # hybrid_parallelism case case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION} \ tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP1-PP1) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 timeout 240m ${train_cmd} > ${log_file} 2>&1 else timeout 15m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_1024_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=gpt_1024 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_1024_flash_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_1024_flash dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_2048_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_2048 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python -m pip install -r ../requirements.txt

# get data
cd ../
rm -rf data
mkdir data
wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh
================================================
#!/usr/bin/env bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}

function _set_params(){
    model_item=${1:-"model_item"}   # (required) model item
    fp_item=${2:-"fp32"}            # (required) fp32|fp16
    dp_degree=${3:-"1"}             # (required) data-parallel degree
    mp_degree=${4:-"1"}             # (required) model(tensor)-parallel degree
    pp_degree=${5:-"1"}             # (required) pipeline-parallel degree
    micro_batch_size=${6:-"2"}      # (required) micro_batch_size
    global_batch_size=${7:-"16"}    # (required) global_batch_size
    run_mode=${8:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1
    device_num=${9:-"N1C1"}         # (required) devices used: N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleFleetX"       # (required) name of the model suite
    speed_unit="tokens/s"           # (required) unit of the speed metric
    skip_steps=0                    # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                  # (required) keyword marking the log lines that carry performance data
    convergence_key="loss:"         # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
    yaml_path=${10:-"./pretrain/configs/pretrain_gpt_345M_single_card.yaml"}
    max_iter=${11:-500}             # (optional) keep the run under 5 minutes; if the code needs an early stop, submit a PR to the suite, or use a max_epoch parameter
    num_workers=0                   # (optional)
    base_batch_size=$global_batch_size
    eval_freq=${12:-"1000"}         # (optional) evaluation interval
    use_recompute=${13:-"False"}    # (optional) whether to enable recompute

    # The settings below are generic; normally no changes are needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) do not change this format; it keeps names aligned with competitor models
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
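    # Illustrative example (not in the original script): for the N1C8 gpt_1024 config
    # above (global_batch_size=64, fp16, run_mode=DP8-MP1-PP1), these settings resolve to
    #   model_name=gpt_1024_bs64_fp16_DP8-MP1-PP1
    #   train_log_file=${TRAIN_LOG_DIR}/PaddleFleetX_gpt_1024_bs64_fp16_DP8-MP1-PP1_N1C8_log
    # which is the log the framework later scans for the keyword "ips:".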
    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${local_batch_size}    # if the model runs multiple devices in one process, compute the per-run batch size here in _train
    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    if [ ${model_item} = "gpt_1024_flash" ];then
        args="-o Model.use_flash_attn=True"
    else
        args=""
    fi

    train_cmd="-c ${yaml_path} ${args} \
        -o Engine.max_steps=${max_iter} \
        -o Engine.eval_freq=${eval_freq} \
        -o Engine.save_load.save_steps=100000 \
        -o Distributed.dp_degree=${dp_degree} \
        "

    if [ ${PADDLE_TRAINER_ID} ]
    then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # The commands below are generic; normally no changes are needed.
    case ${run_mode} in
    DP8-MP1-PP1) echo "run run_mode: ${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            tools/train.py \
            ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "choose run_mode "; exit 1;
    esac
    cd ../
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    if [[ ${model_item} =~ "CE" ]];then # CE accuracy run: no time limit
        ${train_cmd} > ${log_file} 2>&1
    else
        timeout 15m ${train_cmd} > ${log_file} 2>&1
    fi
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
source ${BENCHMARK_ROOT}/scripts/run_model.sh   # run_model.sh parses benchmark-compliant logs with analysis.py; comment this out to only produce training logs without the full pipeline, but re-enable it before submitting
_set_params $@
#_train       # Uncomment to only produce training logs without parsing
_run          # Defined in run_model.sh and calls _train; comment this out to only produce training logs, but re-enable it before submitting

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_CoLA_bs32_fp16_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_item=CE_gpt_finetune_CoLA
dp_degree=1
mp_degree=1
pp_degree=1
bs_item=32
fp_item=fp16
run_mode=DP1-MP1-PP1
device_num=N1C1
convergence_key=mcc:
dataset=CoLA
model=gpt
micro_bs=${bs_item}

cd ./benchmarks
bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${convergence_key} ${dataset} 2>&1;

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_MRPC_acc_bs32_fp16_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors.
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_MRPC_acc dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=MRPC model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_MRPC_f1_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_MRPC_f1 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=f1: dataset=MRPC model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_QNLI_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=CE_gpt_finetune_QNLI dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=QNLI model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_RTE_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_RTE dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=RTE model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_SST2_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_SST2 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=SST2 model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_STSB_pearson_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_STSB_pearson dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=pearson: dataset=STSB model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_STSB_spearman_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_STSB_spearman dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=spearman: dataset=STSB model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_WNLI_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=CE_gpt_finetune_WNLI dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=WNLI model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run sed -i "s/num_train_epochs=5/num_train_epochs=20/g" ../projects/gpt/finetune_gpt_345M_single_card.sh bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get ckpt cd ../ rm -rf ckpt mkdir -p ckpt wget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz tar -xzf ckpt/GPT_345M.tar.gz -C ckpt/ ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
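# Illustrative example (not part of the original script): the finetune launchers
# under N1C1/ append the convergence keyword and GLUE task as the 10th and 11th
# positional arguments, so a call to this script looks roughly like
#   bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh \
#        CE_gpt_finetune_CoLA fp16 1 1 1 32 32 DP1-MP1-PP1 N1C1 mcc: CoLA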
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}

function _set_params(){
    model_item=${1:-"model_item"}   # (required) model item
    fp_item=${2:-"fp32"}            # (required) fp32|fp16
    dp_degree=${3:-"1"}             # (required) data-parallel degree
    mp_degree=${4:-"1"}             # (required) model(tensor)-parallel degree
    pp_degree=${5:-"1"}             # (required) pipeline-parallel degree
    micro_batch_size=${6:-"2"}      # (required) micro_batch_size
    global_batch_size=${7:-"16"}    # (required) global_batch_size
    run_mode=${8:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1
    device_num=${9:-"N1C1"}         # (required) devices used: N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleFleetX"       # (required) name of the model suite
    speed_unit="steps/s"            # (required) unit of the speed metric
    skip_steps=0                    # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                  # (required) keyword marking the log lines that carry performance data
    convergence_key=${10:-"loss:"}  # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
    dataset=${11:-"CoLA"}           # dataset
    max_iter=${12:-500}             # (optional) keep the run under 5 minutes; if the code needs an early stop, submit a PR to the suite, or use a max_epoch parameter
    base_batch_size=$global_batch_size
    sharding_degree=${13:-"1"}      # (optional)
    sharding_stage=${14:-"1"}       # (optional) sharding case

    # The settings below are generic; normally no changes are needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) do not change this format; it keeps names aligned with competitor models
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed

    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${local_batch_size}    # if the model runs multiple devices in one process, compute the per-run batch size here in _train
    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    # if [ ${model_item} = "gpt3_moe" ];then
    #     static_scripts="../examples/language_model/gpt-moe/dygraph/"
    # else
    #     echo "not supported model item: ${model_item}"; exit 1;
    # fi

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    # data_path="./data/"
    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`

    train_cmd="${dataset}"

    # The commands below are generic; normally no changes are needed.
    # hybrid_parallelism case
    case ${run_mode} in
    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1"
        train_cmd="bash projects/gpt/finetune_gpt_345M_single_card.sh \
            ${train_cmd}"
        ;;
    *) echo "choose run_mode "; exit 1;
    esac
    cd ../
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    workerlog_id=0
    timeout 40m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
source ${BENCHMARK_ROOT}/scripts/run_model.sh   # run_model.sh parses benchmark-compliant logs with analysis.py; comment this out to only produce training logs without the full pipeline, but re-enable it before submitting
_set_params $@
#_train       # Uncomment to only produce training logs without parsing
_run          # Defined in run_model.sh and calls _train; comment this out to only produce training logs, but re-enable it before submitting

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C1/gpt_bs16_fp16_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_item=gpt
dp_degree=1
mp_degree=1
pp_degree=1
bs_item=16
fp_item=fp16
run_mode=DP1-MP1-PP1
device_num=N1C1
model=gpt
micro_bs=${bs_item}

cd ./benchmarks
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C1/gpt_bs16_fp32_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_item=gpt
dp_degree=1
mp_degree=1
pp_degree=1
bs_item=16
fp_item=fp32
run_mode=DP1-MP1-PP1
device_num=N1C1
model=gpt
micro_bs=${bs_item}

cd ./benchmarks
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C4/gpt_bs16_fp16_DP1-MP1-PP4.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=1 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP4 device_num=N1C4 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C4/gpt_bs16_fp16_DP1-MP4-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=4 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP4-PP1 device_num=N1C4 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP1-PP8.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=1 pp_degree=8 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP8 device_num=N1C8 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP2-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=2 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP2-PP4 device_num=N1C8 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP4-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=4 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP1-MP4-PP2 device_num=N1C8 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=gpt dp_degree=1 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP8-PP1 device_num=N1C8 model=gpt micro_bs=16 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP2-PP2 device_num=N1C8 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp32_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP2-PP2 device_num=N1C8 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs64_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_recompute_bs16_fp16_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_recompute dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP2-PP2 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_recompute_bs16_fp32_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_recompute dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP2-PP2 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP8-PP4 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
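# Illustrative note (not part of the original script): for the N4C32 config below,
# dp_degree x mp_degree x pp_degree = 2 x 8 x 2 = 32, matching the 4-node x 8-GPU
# device_num=N4C32 layout; run_benchmark.sh then derives
#   local_batch_size = global_batch_size / dp_degree / sharding_degree = 16 / 2 / 1 = 8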
model_item=gpt dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP4-MP8-PP1 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp32 run_mode=DP1-MP8-PP4 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP8-PP2 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP4-MP8-PP1 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}

function _set_params(){
    model_item=${1:-"model_item"}   # (required) model item
    fp_item=${2:-"fp32"}            # (required) fp32|fp16
    dp_degree=${3:-"1"}             # (required) data-parallel degree
    mp_degree=${4:-"1"}             # (required) model(tensor)-parallel degree
    pp_degree=${5:-"1"}             # (required) pipeline-parallel degree
    micro_batch_size=${6:-"2"}      # (required) micro_batch_size
    global_batch_size=${7:-"16"}    # (required) global_batch_size
    run_mode=${8:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1
    device_num=${9:-"N1C1"}         # (required) devices used: N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleFleetX"       # (required) name of the model suite
    speed_unit="tokens/s"           # (required) unit of the speed metric
    skip_steps=0                    # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                  # (required) keyword marking the log lines that carry performance data
    convergence_key="loss:"         # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
    max_iter=${10:-500}             # (optional) keep the run under 5 minutes; if the code needs an early stop, submit a PR to the suite, or use a max_epoch parameter
    num_workers=0                   # (optional)
    base_batch_size=$global_batch_size
    use_recompute=${11:-"False"}    # (optional) whether to enable recompute
    eval_freq=${12:-"1000"}         # (optional) evaluation interval
    sharding_degree=${13:-"1"}      # (optional) group-sharded parallel degree
    sharding_stage=${14:-"1"}       # (optional) sharding strategy: 1 shards optimizer states only, 2 also shards gradients, 3 also shards forward parameters
    sharding_offload=${15:-"False"} # (optional) CPU offload strategy

    # The settings below are generic; normally no changes are needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) do not change this format; it keeps names aligned with competitor models
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed

    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${local_batch_size}    # if the model runs multiple devices in one process, compute the per-run batch size here in _train
    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`

    num_attention_heads=16    #"gpt2-medium-en"
    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi    #"gpt2-small-en"
    num_layers=24             #"gpt2-medium-en"
    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi    #"gpt2-small-en"

    use_pure_fp16=False
    if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi

    train_cmd="-o Global.seed=1234 \
        -o Global.local_batch_size=${local_batch_size} \
        -o Global.micro_batch_size=${micro_batch_size} \
        -o Engine.max_steps=${max_iter} \
        -o Engine.eval_freq=${eval_freq} \
        -o Engine.mix_precision.enable=${use_pure_fp16} \
        -o Engine.save_load.save_steps=100000 \
        -o Model.hidden_size=1024 \
        -o Model.num_layers=${num_layers} \
        -o Model.num_attention_heads=${num_attention_heads} \
        -o Model.type_vocab_size=1 \
        -o Model.use_recompute=${use_recompute} \
        -o Distributed.dp_degree=${dp_degree} \
        -o Distributed.mp_degree=${mp_degree} \
        -o Distributed.pp_degree=${pp_degree} \
        -o Distributed.sharding.sharding_degree=${sharding_degree} \
        -o Distributed.sharding.sharding_stage=${sharding_stage} \
        -o Distributed.sharding.sharding_offload=${sharding_offload} \
        -o Optimizer.lr.max_lr=1e-4 \
        -o Optimizer.lr.min_lr=1e-5 "

    if [ ${PADDLE_TRAINER_ID} ]
    then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # The commands below are generic; normally no changes are needed.
    case ${run_mode} in
    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\
            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \
            ${train_cmd}"
        workerlog_id=0
        ;;
    DP1-MP1-PP4|DP1-MP4-PP1) echo "run run_mode: ${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3 ${PADDLE_RANK_OPTION}\
            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \
            ${train_cmd}"
        workerlog_id=0
        ;;
    DP8-MP1-PP1|DP1-MP8-PP1|DP1-MP1-PP8|DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2| \
    DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo "run run_mode: ${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \
            ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "choose run_mode "; exit 1;
    esac
    cd ../
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    if [[ ${model_item} =~ "CE" ]];then # CE accuracy run: no time limit
        ${train_cmd} > ${log_file} 2>&1
    else
        timeout 15m ${train_cmd} > ${log_file} 2>&1
    fi
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
source ${BENCHMARK_ROOT}/scripts/run_model.sh   # run_model.sh parses benchmark-compliant logs with analysis.py; comment this out to only produce training logs without the full pipeline, but re-enable it before submitting
_set_params $@
#_train       # Uncomment to only produce training logs without parsing
_run          # Defined in run_model.sh and calls _train; comment this out to only produce training logs, but re-enable it before submitting

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N1C8/gpt_sp_False_bs8_fp16_DP1-MP8-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
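# Illustrative note (not part of the original script): for the DP1-MP8-PP1 config
# below, run_benchmark.sh keeps the "gpt2-medium-en"-style shape
# (num_attention_heads=16, num_layers=24) because mp_degree=8 fails the
# "mp_degree < 8 and pp_degree < 8" check, and the sequence_parallel flag is
# forwarded unchanged as -o Model.sequence_parallel=${sequence_parallel}.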
model_item=gpt_sp_False dp_degree=1 mp_degree=8 pp_degree=1 bs_item=8 fp_item=fp16 run_mode=DP1-MP8-PP1 device_num=N1C8 sequence_parallel=False model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N1C8/gpt_sp_True_bs8_fp16_DP1-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_sp_True dp_degree=1 mp_degree=8 pp_degree=1 bs_item=8 fp_item=fp16 run_mode=DP1-MP8-PP1 device_num=N1C8 sequence_parallel=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N4C32/gpt_sp_False_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_sp_False dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 sequence_parallel=False model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N4C32/gpt_sp_True_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_sp_True dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 sequence_parallel=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
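# Example invocation (a sketch using the values from the N1C8 gpt_sp_True launcher above):
#   bash run_benchmark.sh gpt_sp_True fp16 1 8 1 8 8 DP1-MP8-PP1 N1C8 True
# which _set_params below parses as model_item=gpt_sp_True, fp_item=fp16, dp_degree=1,
# mp_degree=8, pp_degree=1, micro_batch_size=8, global_batch_size=8, run_mode=DP1-MP8-PP1,
# device_num=N1C8 and sequence_parallel=True.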
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="tokens/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" sequence_parallel=${10:-"False"} # (可选)是否打开sequence_parallel max_iter=${11:-1000} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 eval_freq=${12:-"1000"} # (可选)模型评估间隔 num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${13:-"True"} # (可选)是否打开recompute sharding_degree=${14:-"1"} # (可选)分组切分并行维度 sharding_stage=${15:-"1"} # (可选)切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 sharding_offload=${16:-"False"} # (可选)CPU offload策略 # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` num_attention_heads=16 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #"gpt2-small-en" num_layers=24 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #"gpt2-small-en" use_pure_fp16=False if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Engine.max_steps=${max_iter} \ -o Engine.eval_iters=${eval_freq} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ -o Model.sequence_parallel=${sequence_parallel} \ -o Distributed.sharding.reduce_overlap=False \ -o 
Distributed.sharding.broadcast_overlap=False \ -o Optimizer.tensor_fusion=False " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id=0 ;; DP1-MP8-PP1) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP8-PP2) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 60m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage2_bs16_fp16_DP1-MP1-PP1-Sharding2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_stage2 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP1-Sharding2 device_num=N1C2 sharding_degree=2 sharding_stage=2 sharding_offload=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage3_bs16_fp16_DP1-MP1-PP1-Sharding2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_stage3 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP1-Sharding2 device_num=N1C2 sharding_degree=2 sharding_stage=3 sharding_offload=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage3_bs16_fp32_DP1-MP1-PP1-Sharding2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_stage3 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP1-MP1-PP1-Sharding2 device_num=N1C2 sharding_degree=2 sharding_stage=3 sharding_offload=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N2C16/gpt_stage2_bs128_fp16_DP1-MP1-PP1-Sharding16.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
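#
# In the shared sharding run_benchmark.sh the per-rank batch size is derived as
#   local_batch_size = global_batch_size / dp_degree / sharding_degree
# so for the settings below (bs_item=128, dp_degree=1, sharding_degree=16) it works out to
# 128 / 1 / 16 = 8, matching micro_bs=8 (worked example for illustration only).
#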
model_item=gpt_stage2 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=128 fp_item=fp16 run_mode=DP1-MP1-PP1-Sharding16 device_num=N2C16 sharding_degree=16 sharding_stage=2 sharding_offload=True max_iter=30 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
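# Example invocation (a sketch using the values from the N1C2 gpt_stage2 launcher above):
#   bash run_benchmark.sh gpt_stage2 fp16 1 1 1 8 16 DP1-MP1-PP1-Sharding2 N1C2 2 2 True
# where the trailing three values are read by _set_params below as sharding_degree=2,
# sharding_stage=2 and sharding_offload=True (positional arguments 10-12).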
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="tokens/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" sharding_degree=${10:-"1"} # (可选)分组切分并行维度 sharding_stage=${11:-"1"} # (可选)切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 sharding_offload=${12:-"False"} # (可选)CPU offload策略 max_iter=${13:-500} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 eval_freq=${14:-"1000"} # (可选)模型评估间隔 num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${15:-"True"} # (可选)是否打开recompute # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` use_pure_fp16=False if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.use_recompute=${use_recompute} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1-Sharding2) echo "run run_mode: 
DP1-MP1-PP1-Sharding2" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1 ${PADDLE_RANK_OPTION}\ ./tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ -o Global.seed=1234 \ -o Model.hidden_size=1024 \ -o Model.num_layers=4 \ -o Model.num_attention_heads=4 \ -o Model.type_vocab_size=1 \ -o Optimizer.lr.max_lr=1e-4 \ -o Optimizer.lr.min_lr=1e-5 \ ${train_cmd}" workerlog_id=0 ;; DP1-MP1-PP1-Sharding16) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ ./tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \ -o Engine.logging_freq=1 \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 70m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/gpt/static/auto_parallel/N1C1/gpt_auto_recompute_bs8_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_auto_recompute dp_degree=1 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 max_iter=500 use_recompute=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. # Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="samples/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" max_iter=${10:-500} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${11:-"False"} # (可选)是否打开recompute verbose=${12:-"3"} # (可选)是否打印性能数据 logging_freq=${13:-"100000"} # (可选)loss打印频率 sharding_degree=${14:-"1"} # (可选) sharding_stage=${15:-"1"} # (可选)sharding case # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, 
device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` num_attention_heads=16 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #"gpt2-small-en" num_layers=24 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #"gpt2-small-en" use_pure_fp16=False # fp32 if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Global.seed=1234 \ -o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=100000 \ -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.hidden_size=1024 \ -o Model.num_layers=${num_layers} \ -o Model.num_attention_heads=${num_attention_heads} \ -o Model.type_vocab_size=1 \ -o Model.use_recompute=${use_recompute} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Optimizer.lr.max_lr=1e-4 \ -o Optimizer.lr.min_lr=1e-5 \ -o Engine.verbose=${verbose} \ -o Engine.logging_freq=${logging_freq} " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\ tools/auto.py -c ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP2-PP2) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/auto.py -c ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id_1=4 workerlog_id_2=6 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 20m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id_1} ${log_file} cp mylog/workerlog.${workerlog_id_2} ${log_file}_2 fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C1/imagen_397M_text2im_64_bs1_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_397M_text2im_64 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=1 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C1/imagen_SR256_bs1_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_SR256 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=1 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_2B_text2im_64_bs8_fp32_DP1-Sharding8.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
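#
# The imagen launchers pass the model's config YAML (and, for this sharded 2B run, also
# max_iter, sharding_degree and sharding_stage) as extra positional arguments to the shared
# run_benchmark.sh. A sketch of the resulting call for the settings below:
#
#   bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh \
#       imagen_2B_text2im_64 fp32 1 1 1 1 8 DP1-Sharding8 N1C8 \
#       ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml 1000 8 2
#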
model_item=imagen_2B_text2im_64 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP1-Sharding8 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml max_iter=1000 sharding_degree=8 sharding_stage=2 model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} ${max_iter} ${sharding_degree} ${sharding_stage} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_397M_text2im_64_bs8_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_397M_text2im_64 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_SR256_bs8_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_SR256 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_text2im_64_debertav2_bs8_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_text2im_64_debertav2 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ wget -O projects/imagen/part-00079 https://paddlefleetx.bj.bcebos.com/data/laion400m/part-00079 # T5-11B mkdir -p projects/imagen/t5/t5-11b/ && cd projects/imagen/t5/t5-11b/ wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/config.json wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/spiece.model wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/tokenizer.json wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.1 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.2 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.3 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.4 cat t5.pd.tar.gz.* |tar -xf - cd - # DeBERTa V2 1.5B mkdir -p projects/imagen/cache/deberta-v-xxlarge && cd projects/imagen/cache/deberta-v-xxlarge wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/config.json wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/spm.model wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/tokenizer_config.json wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.1 cat debertav2.pd.tar.gz.* | tar -xf - cd - ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. # Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) yaml_path=${10:-"ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml"} profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="step/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="speed:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" max_iter=${11:-1000} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size sharding_degree=${12:-"1"} # (可选) sharding_stage=${13:-"1"} # (可选)sharding case sharding_offload=${14:-"False"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` train_cmd="-o Engine.max_steps=${max_iter} \ -o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ " if [ ${PADDLE_TRAINER_ID} ] then 
PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 \ ${PADDLE_RANK_OPTION} tools/train.py -c ${yaml_path} \ ${train_cmd}" workerlog_id=0 ;; DP8-MP1-PP1|DP1-Sharding8) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 \ ${PADDLE_RANK_OPTION} tools/train.py -c ${yaml_path} \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 30m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/N1C8/ViT_large_patch16_384_ft_fused_False_bs512_fp16_DP.sh ================================================ model_item=ViT_large_patch16_384_ft_fused_False fp_item=fp16 bs_item=512 run_mode=DP device_num=N1C8 use_fused_attn=False max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/N1C8/ViT_large_patch16_384_ft_fused_True_bs512_fp16_DP.sh ================================================ model_item=ViT_large_patch16_384_ft_fused_True fp_item=fp16 bs_item=512 run_mode=DP device_num=N1C8 use_fused_attn=True max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
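#
# prepare.sh below assumes the benchmark framework's data directory is reachable through
# BENCHMARK_ROOT (it copies the ImageNet archive from
# ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/). A minimal sketch of the expected
# environment, with an illustrative path:
#
#   export BENCHMARK_ROOT=/path/to/benchmark_framework
#   bash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh
#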
python -m pip install -r ../requirements.txt # get data cd ../ mkdir dataset && cd dataset cp -r ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/ILSVRC2012.tgz ./ tar -zxf ILSVRC2012.tgz cd - # pretrained mkdir -p pretrained/vit/ wget -O ./pretrained/vit/imagenet21k-ViT-L_16.pdparams \ https://paddle-wheel.bj.bcebos.com/benchmark/imagenet21k-ViT-L_16.pdparams ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Test training benchmark for a model. # Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 global_batch_size=${3:-"128"} # (必选)global_batch_size run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="images/sec" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" use_fused_attn=${6:-"False"} max_iter=${7:-1} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi train_cmd="-o Engine.num_train_epochs=${max_iter} \ -o Model.model.use_fused_attn=${use_fused_attn} \ " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION} \ tools/train.py -c ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml \ ${train_cmd}" workerlog_id=0 cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 15m ${train_cmd} > ${log_file} 2>&1 fi if [ $? 
-ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/N2C16/ViT_large_patch16_224_pt_fused_False_bs128_fp16_DP.sh ================================================ model_item=ViT_large_patch16_224_pt_fused_False fp_item=fp16 bs_item=128 run_mode=DP device_num=N2C16 use_fused_attn=False max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/N2C16/ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.sh ================================================ model_item=ViT_large_patch16_224_pt_fused_True fp_item=fp16 bs_item=128 run_mode=DP device_num=N2C16 use_fused_attn=True max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ mkdir dataset && cd dataset cp -r ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/ILSVRC2012.tgz ./ tar -zxf ILSVRC2012.tgz cd - ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Test training benchmark for a model. 
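# Example invocation (a sketch using the values from the N2C16 fused_True launcher above):
#   bash run_benchmark.sh ViT_large_patch16_224_pt_fused_True fp16 128 DP N2C16 True 1
# The benchmark name assembled by _set_params below for this case is
# ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.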
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 global_batch_size=${3:-"128"} # (必选)global_batch_size run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) yaml_path=${6:-"./task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml"} profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="images/sec" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" use_fused_attn=${7:-"False"} max_iter=${8:-1} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size pretrained_model=${9:-"null"} # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi train_cmd="-o Engine.num_train_epochs=${max_iter} \ -o Data.Train.sampler.batch_size=${global_batch_size} \ -o Model.model.name=ViT_large_patch16_224 \ -o Model.model.use_fused_attn=${use_fused_attn} " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION} \ tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml \ ${train_cmd}" workerlog_id=0 cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 15m ${train_cmd} > ${log_file} 2>&1 fi if [ $? 
-ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: codestyle/.gitignore ================================================ *.pyc ================================================ FILE: codestyle/clang_format.hook ================================================ #!/bin/bash set -e readonly VERSION="13.0.0" version=$(clang-format -version) if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then echo "clang-format installation by pip need python version great equal 3.6, please change the default python to higher version." exit 1 fi if ! [[ $version == *"$VERSION"* ]]; then # low version of pip may not have the source of clang-format whl pip install --upgrade pip pip install clang-format==13.0.0 fi clang-format $@ ================================================ FILE: codestyle/copyright.hook ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals import argparse import io import re import sys import os import datetime COPYRIGHT = '''Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License.''' def _generate_copyright(comment_mark): copyright=COPYRIGHT.split(os.linesep) header = copyright[0].rstrip() p = re.search('(\d{4})', header).group(0) now = datetime.datetime.now() header = header.replace(p,str(now.year)) ans=[comment_mark + " " + header + os.linesep] for idx, line in enumerate(copyright[1:]): ans.append(comment_mark + " " + line.rstrip() + os.linesep) return ans def _get_comment_mark(path): lang_type=re.compile(r"\.(py|sh)$") if lang_type.search(path) is not None: return "#" lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$") if lang_type.search(path) is not None: return "//" return None RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE) RE_COPYRIGHT = re.compile(r".*Copyright \(c\) \d{4}", re.IGNORECASE) RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!") def _check_copyright(path): head=[] try: with open(path) as f: head = [next(f) for x in range(4)] except StopIteration: pass for idx, line in enumerate(head): if RE_COPYRIGHT.search(line) is not None: return True return False def generate_copyright(path, comment_mark): original_contents = io.open(path, encoding="utf-8").readlines() head = original_contents[0:4] insert_line_no=0 for i, line in enumerate(head): if RE_ENCODE.search(line) or RE_SHEBANG.search(line): insert_line_no=i+1 copyright = _generate_copyright(comment_mark) if insert_line_no == 0: new_contents = copyright if len(original_contents) > 0 and len(original_contents[0].strip()) != 0: new_contents.append(os.linesep) new_contents.extend(original_contents) else: new_contents=original_contents[0:insert_line_no] new_contents.append(os.linesep) new_contents.extend(copyright) if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0: new_contents.append(os.linesep) new_contents.extend(original_contents[insert_line_no:]) new_contents="".join(new_contents) with io.open(path, 'w') as output_file: output_file.write(new_contents) def main(argv=None): parser = argparse.ArgumentParser( description='Checker for copyright declaration.') parser.add_argument('filenames', nargs='*', help='Filenames to check') args = parser.parse_args(argv) retv = 0 for path in args.filenames: comment_mark = _get_comment_mark(path) if comment_mark is None: print("warning:Unsupported file", path, file=sys.stderr) continue if _check_copyright(path): continue generate_copyright(path, comment_mark) if __name__ == '__main__': exit(main()) ================================================ FILE: codestyle/cpplint_pre_commit.hook ================================================ #!/bin/bash TOTAL_ERRORS=0 readonly VERSION="1.6.0" version=$(cpplint --version) if [[ ! $TRAVIS_BRANCH ]]; then # install cpplint on local machine. if ! [[ $version == *"$VERSION"* ]]; then pip install cpplint==1.6.0 fi # diff files on local machine. files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}') else # diff files between PR and latest commit on Travis CI. 
branch_ref=$(git rev-parse "$TRAVIS_BRANCH") head_ref=$(git rev-parse HEAD) files=$(git diff --name-status $branch_ref $head_ref | awk '$1 != "D" {print $2}') fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $files; do if [[ $file =~ ^(patches/.*) ]]; then continue; else cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); fi done exit $TOTAL_ERRORS ================================================ FILE: codestyle/docstring_checker.py ================================================ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """DocstringChecker is used to check python doc string's style.""" import astroid from pylint.checkers import BaseChecker, utils from pylint.interfaces import IAstroidChecker from collections import defaultdict import re def register(linter): """Register checkers.""" linter.register_checker(DocstringChecker(linter)) class Docstring(object): """Docstring class holds the parsed doc string elements. """ def __init__(self): self.d = defaultdict(list) #name->[] self.clear() def clear(self): self.d['Args'] = [] self.d['Examples'] = [] self.d['Returns'] = [] self.d['Raises'] = [] self.args = {} #arg_name->arg_type def get_level(self, string, indent=' '): level = 0 unit_size = len(indent) while string[:unit_size] == indent: string = string[unit_size:] level += 1 return level def parse(self, doc): """parse gets sections from doc Such as Args, Returns, Raises, Examples s Args: doc (string): is the astroid node doc string. Returns: True if doc is parsed successfully. """ self.clear() lines = doc.splitlines() state = ("others", -1) for l in lines: c = l.strip() if len(c) <= 0: continue level = self.get_level(l) if c.startswith("Args:"): state = ("Args", level) elif c.startswith("Returns:"): state = ("Returns", level) elif c.startswith("Raises:"): state = ("Raises", level) elif c.startswith("Examples:"): state = ("Examples", level) else: if level > state[1]: self.d[state[0]].append(c) continue state = ("others", -1) self.d[state[0]].append(c) self._arg_with_type() return True def get_returns(self): return self.d['Returns'] def get_raises(self): return self.d['Raises'] def get_examples(self): return self.d['Examples'] def _arg_with_type(self): for t in self.d['Args']: m = re.search(r'([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t) if m: self.args[m.group(1)] = m.group(2) return self.args class DocstringChecker(BaseChecker): """DosstringChecker is pylint checker to check docstring style. """ __implements__ = (IAstroidChecker, ) POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument' KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument' name = 'doc-string-checker' symbol = "doc-string" priority = -1 msgs = { 'W9001': ('One line doc string on > 1 lines', symbol + "-one-line", 'Used when a short doc string is on multiple lines'), 'W9002': ('Doc string does not end with "." 
period', symbol + "-end-with", 'Used when a doc string does not end with a period'), 'W9003': ('All args with their types must be mentioned in doc string %s', symbol + "-with-all-args", 'Used when not all arguments are in the doc string '), 'W9005': ('Missing docstring or docstring is too short', symbol + "-missing", 'Add docstring longer >=10'), 'W9006': ('Docstring indent error, use 4 space for indent', symbol + "-indent-error", 'Use 4 space for indent'), 'W9007': ('You should add `Returns` in comments', symbol + "-with-returns", 'There should be a `Returns` section in comments'), 'W9008': ('You should add `Raises` section in comments', symbol + "-with-raises", 'There should be a `Raises` section in comments'), } options = () def visit_functiondef(self, node): """visit_functiondef checks Function node docstring style. Args: node (astroid.node): The visiting node. Returns: True if successful other wise False. """ self.check_doc_string(node) if node.tolineno - node.fromlineno <= 10: return True if not node.doc: return True doc = Docstring() doc.parse(node.doc) self.all_args_in_doc(node, doc) self.with_returns(node, doc) self.with_raises(node, doc) def visit_module(self, node): self.check_doc_string(node) def visit_classdef(self, node): self.check_doc_string(node) def check_doc_string(self, node): self.missing_doc_string(node) self.one_line(node) self.has_period(node) self.indent_style(node) def missing_doc_string(self, node): if node.name.startswith("__") or node.name.startswith("_"): return True if node.tolineno - node.fromlineno <= 10: return True if node.doc is None or len(node.doc) < 10: self.add_message('W9005', node=node, line=node.fromlineno) return False # FIXME(gongwb): give the docstring line-no def indent_style(self, node, indent=4): """indent_style checks docstring's indent style Args: node (astroid.node): The visiting node. indent (int): The default indent of style Returns: True if successful other wise False. """ if node.doc is None: return True doc = node.doc lines = doc.splitlines() line_num = 0 for l in lines: if line_num == 0: continue cur_indent = len(l) - len(l.lstrip()) if cur_indent % indent != 0: self.add_message('W9006', node=node, line=node.fromlineno) return False line_num += 1 return True def one_line(self, node): """one_line checks if docstring (len < 40) is on one line. Args: node (astroid.node): The node visiting. Returns: True if successful otherwise False. """ doc = node.doc if doc is None: return True if len(doc) > 40: return True elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3: return True else: self.add_message('W9001', node=node, line=node.fromlineno) return False return True def has_period(self, node): """has_period checks if one line doc end-with '.' . Args: node (astroid.node): the node is visiting. Returns: True if successful otherwise False. """ if node.doc is None: return True if len(node.doc.splitlines()) > 1: return True if not node.doc.strip().endswith('.'): self.add_message('W9002', node=node, line=node.fromlineno) return False return True def with_raises(self, node, doc): """with_raises checks if one line doc end-with '.' . Args: node (astroid.node): the node is visiting. doc (Docstring): Docstring object. Returns: True if successful otherwise False. 
""" find = False for t in node.body: if not isinstance(t, astroid.Raise): continue find = True break if not find: return True if len(doc.get_raises()) == 0: self.add_message('W9008', node=node, line=node.fromlineno) return False return True def with_returns(self, node, doc): """with_returns checks if docstring comments what are returned . Args: node (astroid.node): the node is visiting. doc (Docstring): Docstring object. Returns: True if successful otherwise False. """ if node.name.startswith("__") or node.name.startswith("_"): return True find = False for t in node.body: if not isinstance(t, astroid.Return): continue find = True break if not find: return True if len(doc.get_returns()) == 0: self.add_message('W9007', node=node, line=node.fromlineno) return False return True def all_args_in_doc(self, node, doc): """all_args_in_doc checks if arguments are mentioned in doc Args: node (astroid.node): the node is visiting. doc (Docstring): Docstring object Returns: True if successful otherwise False. """ if node.name.startswith("__") or node.name.startswith("_"): return True args = [] for arg in node.args.get_children(): if (not isinstance(arg, astroid.AssignName)) \ or arg.name == "self": continue args.append(arg.name) if len(args) <= 0: return True parsed_args = doc.args args_not_documented = set(args) - set(parsed_args) if len(args) > 0 and len(parsed_args) <= 0: self.add_message( 'W9003', node=node, line=node.fromlineno, args=list(args_not_documented)) return False for t in args: if t not in parsed_args: self.add_message( 'W9003', node=node, line=node.fromlineno, args=[t, ]) return False return True ================================================ FILE: codestyle/pylint_pre_commit.hook ================================================ #!/bin/bash TOTAL_ERRORS=0 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export PYTHONPATH=$DIR:$PYTHONPATH readonly VERSION="2.12.0" version=$(pylint --version | grep 'pylint') if ! [[ $version == *"$VERSION"* ]]; then pip install pylint==2.12.0 fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do pylint --disable=all --load-plugins=docstring_checker \ --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done exit $TOTAL_ERRORS #For now, just warning: #exit 0 Footer ================================================ FILE: codestyle/test_docstring_checker.py ================================================ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import docstring_checker import pylint.testutils import astroid import pytest import sys class TestDocstring(pylint.testutils.CheckerTestCase): CHECKER_CLASS = docstring_checker.DocstringChecker def test_one_line(self): func_node = astroid.extract_node(''' def test(): """get news. 
""" if True: return 5 return 5 ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9001' == got[0][0] def test_one_line_1(self): func_node = astroid.extract_node(''' def test(): """get news""" if True: return 5 return 5 ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9002' == got[0][0] def test_args(self): func_node = astroid.extract_node(''' def test(scale, mean): """get news. Args: scale (int): scale is the number. """ mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9003' == got[0][0] def test_missing(self): func_node = astroid.extract_node(''' def test(): mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9005' == got[0][0] def test_indent(self): func_node = astroid.extract_node(''' def test(): """ get get get get get get get get get get get get get get get get. """ pass ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9006' == got[0][0] def test_with_resturns(self): func_node = astroid.extract_node(''' def test(): """get news. Args: scale (int): scale is the number. """ mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale return mean ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9007' == got[0][0] def test_with_raises(self): func_node = astroid.extract_node(''' def test(): """get news. Args: scale (int): scale is the number. """ mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale raise ValueError('A very specific bad thing happened.') ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9008' == got[0][0] def test_no_message(self): p = ''' def fc(input, size, num_flatten_dims=1, param_attr=None, bias_attr=None, act=None, name=None): """ **Fully Connected Layer** The fully connected layer can take multiple tensors as its inputs. It creates a variable called weights for each input tensor, which represents a fully connected weight matrix from each input unit to each output unit. The fully connected layer multiplies each input tensor with its coresponding weight to produce an output Tensor. If multiple input tensors are given, the results of multiple multiplications will be sumed up. If bias_attr is not None, a bias variable will be created and added to the output. Finally, if activation is not None, it will be applied to the output as well. This process can be formulated as follows: Args: input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of the input tensor(s) is at least 2. size(int): The number of output units in this layer. num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than two dimensions. If this happens, the multidimensional tensor will first be flattened into a 2-dimensional matrix. 
The parameter `num_flatten_dims` determines how the input tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) dimensions will be flatten to form the first dimension of the final matrix (height of the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias of this layer. If it is set to None, no bias will be added to the output units. act (str, default None): Activation to be applied to the output of this layer. name (str, default None): The name of this layer. Returns: A tensor variable storing the transformation result. Raises: ValueError: If rank of the input tensor is less than 2. Examples: .. code-block:: python data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, act="tanh") """ raise ValueError('A very specific bad thing happened.') size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 return size ''' func_node = astroid.extract_node(p) self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 0 ================================================ FILE: docs/cluster_deployment.md ================================================ ## 集群部署 本文档介绍在集群上使用分布式进行大模型训练的方法,包括在 Kubernetes 上使用 PaddlePaddle 分布式和在云上使用的方法。 ### 1. 
Kubernetes部署 在 Kubernetes 上部署分布式任务需要安装 [paddle-operator](https://github.com/PaddleFlow/paddle-operator) 。 paddle-operator 通过添加自定义资源类型 (paddlejob) 以及部署 controller 和一系列 Kubernetes 原生组件的方式实现简单定义即可运行 PaddlePaddle 任务的需求。 目前支持运行 ParameterServer (PS) 和 Collective 两种分布式任务,当然也支持运行单节点任务。 **paddle-operator 安装** 安装 paddle-operator 需要有已经安装的 Kubernetes (v1.16+) 集群和 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (v1.16+) 工具。 本节所需配置文件和示例可以在 [这里](https://github.com/PaddleFlow/paddle-operator/tree/main/deploy) 找到, 可以通过 *git clone* 或者复制文件内容保存。 ```yaml deploy |-- examples | |-- resnet.yaml | |-- wide_and_deep.yaml | |-- wide_and_deep_podip.yaml | |-- wide_and_deep_service.yaml | `-- wide_and_deep_volcano.yaml |-- v1 | |-- crd.yaml | `-- operator.yaml ``` 执行以下命令, ```shell kubectl create -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/dev/deploy/v1/crd.yaml ``` 或者 ```shell kubectl create -f deploy/v1/crd.yaml ``` 通过以下命令查看是否成功, ```shell kubectl get crd NAME CREATED AT paddlejobs.batch.paddlepaddle.org 2021-02-08T07:43:24Z ``` 执行以下部署命令, ```shell kubectl create -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/dev/deploy/v1/operator.yaml ``` 或者 ```shell kubectl create -f deploy/v1/operator.yaml ``` 通过以下命令查看部署结果和运行状态, ```shell kubectl -n paddle-system get pods NAME READY STATUS RESTARTS AGE paddle-controller-manager-698dd7b855-n65jr 1/1 Running 0 1m ``` 通过查看 controller 日志以确保运行正常, ```shell kubectl -n paddle-system logs paddle-controller-manager-698dd7b855-n65jr ``` 提交 demo 任务查看效果, ```shell kubectl -n paddle-system create -f deploy/examples/wide_and_deep.yaml ``` 查看 paddlejob 任务状态, pdj 为 paddlejob 的缩写, ```shell kubectl -n paddle-system get pdj NAME STATUS MODE AGE wide-ande-deep-service Completed PS 4m4s ``` 以上信息可以看出:训练任务已经正确完成,该任务为 ps 模式。 可通过 cleanPodPolicy 配置任务完成/失败后的 pod 删除策略,详见任务配置。 训练期间可以通过如下命令查看 pod 状态, ```shell kubectl -n paddle-system get pods ``` **paddlejob 任务提交** 本resnet示例为 Collective 模式,使用 GPU 进行训练,只需要配置 worker,worker 配置中需要声明使用的 GPU 信息。 准备配置文件, ```yaml apiVersion: batch.paddlepaddle.org/v1 kind: PaddleJob metadata: name: resnet spec: cleanPodPolicy: Never worker: replicas: 2 template: spec: containers: - name: paddle image: registry.baidubce.com/paddle-operator/demo-resnet:v1 command: - python args: - "-m" - "paddle.distributed.launch" - "train_fleet.py" volumeMounts: - mountPath: /dev/shm name: dshm resources: limits: nvidia.com/gpu: 1 volumes: - name: dshm emptyDir: medium: Memory ``` 注意: * 这里需要添加 shared memory 挂载以防止缓存出错。 * 本示例采用内置 flower 数据集,程序启动后会进行下载,根据网络环境可能等待较长时间。 提交任务: 使用 kubectl 提交 yaml 配置文件以创建任务, ```shell kubectl -n paddle-system create -f resnet.yaml ``` **卸载** 通过以下命令卸载部署的组件, ```shell kubectl delete -f deploy/v1/crd.yaml -f deploy/v1/operator.yaml ``` *注意:重新安装时,建议先卸载再安装* ### 2. 
公有云和私有云部署 在公有云上运行 PaddlePaddle 分布式建议通过选购容器引擎服务的方式,各大云厂商都推出了基于标准 Kubernetes 的云产品,然后根据上节中的教程安装使用即可。 | 云厂商 | 容器引擎 | 链接 | | --- | ---- | -------------------------------------------- | | 百度云 | CCE | https://cloud.baidu.com/product/cce.html | | 阿里云 | ACK | https://help.aliyun.com/product/85222.html | | 华为云 | CCE | https://www.huaweicloud.com/product/cce.html | 更为方便的是使用百度提供的全功能AI开发平台 [BML](https://cloud.baidu.com/product/bml) 来使用,详细的使用方式请参考 [BML文档](https://ai.baidu.com/ai-doc/BML/pkhxhgo5v)。 ================================================ FILE: docs/compression.md ================================================ # 模型压缩 ------------------------------------------------------------------------------------------ ## **简介** PaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法:量化训练(Qutization Aware Training,QAT)、结构化稀疏(Structured Pruning,SP)和知识蒸馏(Knowledge Distillation,KD)。本文会介绍如何在 PaddleFleetX 中使用这些功能,来压缩并且导出压缩后的模型。 ## **特性** - 量化训练:通过将全连接层的矩阵乘计算由 Float 浮点型优化为 INT8 整型来优化推理性能; - 结构化稀疏:通过剪裁全连接层权重的通道数目来优化推理性能; - 知识蒸馏:通过使用高精度的大模型(教师模型)来蒸馏低精度的小模型(学生模型)来提升小模型精度 ## **配置文档** 模型压缩开关通过 Compress 字段控制,预训练的模型参数路径由 pretrained 指定。接下来就是量化训练、结构化稀疏和知识蒸馏各自的技术参数。 ```yaml Compress: pretrained: // 预训练模型参数的保存路径 Quantization: // 量化训练参数 Prune: // 结构化稀疏参数 Distillation: // 知识蒸馏参数 ``` **注意**: 我们正在开发上述三种压缩方法的联合使用,请先单独使用上述各个方法。 ### **量化训练参数** ```yaml Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_preprocess_type: None activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ``` 其中参数说明: | **参数名** | **参数释义** | |-----------------------------|-----------------------------------------| | pretrained | 预训练模型的加载目录,若设置该参数,将在量化之前加载预训练模型;若需要加载量化后参数,将此参数设置为None,直接设置Engine.save_load.ckpt_dir即可 | | enable | 是否开启量化训练 | | weight_quantize_type | weight量化方法, 默认为`channel_wise_abs_max`, 此外还支持`abs_max` | | activation_quantize_type | activation量化方法, 默认为`moving_average_abs_max` | | weight_preprocess_type | weight预处理方法,默认为None,代表不进行预处理;当需要使用`PACT`方法时设置为`PACT` | | activation_preprocess_type | activation预处理方法,默认为None,代表不进行预处理 | | weight_bits | weight量化比特数, 默认为 8 | | activation_bits | activation量化比特数, 默认为 8 | | quantizable_layer_type | 需要量化的算子类型 | | onnx_format | 是否使用新量化格式,默认为False | 更详细的量化训练参数介绍可参考[PaddleSlim动态图量化训练接口介绍](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/dygraph/quanter/qat.rst)。 ### **结构化稀疏参数** ```yaml Compress: pretrained: Prune: enable: True criterion: l1_norm ratio: 0.125 ``` 其中参数说明: | **参数名** | **参数释义** | |-----------------------------|-----------------------------------------| | pretrained | 预训练模型的加载目录 | | enable | 是否开启结构化稀疏训练 | | criterion | 权重的重要性指标,目前支持l1_norm 和 l2_norm| | ratio | 权重稀疏的比例。例如,0.125的意思是12.5%的权重会被稀疏掉 | ================================================ FILE: docs/deployment_faq.md ================================================ ## 环境验证和常见问题 本文为环境问题排查指引,包括环境正确性验证的方法和常见的一些问题解决方法。 ### 1. 单机环境验证 以下验证不区分本机环境和 Docker 环境。 **GPU验证** 当使用 GPU 时,使用 `nvidia-smi` 命令查看环境中 GPU 状态,预期输出如下 ```shell Thu Jul 21 19:32:03 2022 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:3F:00.0 Off | 0 | | N/A 33C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 Tesla V100-SXM2... On | 00000000:40:00.0 Off | 0 | | N/A 34C P0 41W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 2 Tesla V100-SXM2... On | 00000000:41:00.0 Off | 0 | | N/A 35C P0 41W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 3 Tesla V100-SXM2... On | 00000000:42:00.0 Off | 0 | | N/A 38C P0 42W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 4 Tesla V100-SXM2... On | 00000000:62:00.0 Off | 0 | | N/A 34C P0 39W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 5 Tesla V100-SXM2... On | 00000000:63:00.0 Off | 0 | | N/A 36C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 6 Tesla V100-SXM2... On | 00000000:64:00.0 Off | 0 | | N/A 37C P0 41W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 7 Tesla V100-SXM2... On | 00000000:65:00.0 Off | 0 | | N/A 36C P0 39W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+ ``` 结果中可以看出 * CUDA Version栏显示的是当前环境中的CUDA版本号,此处为11.2。开始使用飞桨前,请先保证此处CUDA Version显示正常。如果CUDA Version栏不显示版本号,则需要添加CUDA相关库的路径到环境变量`LD_LIBRARY_PATH`中,例如执行命令添加 `export LD_LIBRARY_PATH=/usr/lib64/:/usr/local/lib/:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}` 。具体请参考[文档](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)。 * Memory-Usage 列显示的是当前的显存占用值,此处为0MiB,表示当前设备的显存未被占用;GPU-Util 列显示的是当前的GPU利用率,此处为0%,表示当前设备空闲,可以使用。开始使用飞桨前,请保证当前设备显存充足,且利用率处于空闲状态。 * 最后的 Processes 信息表示正在使用设备的进程,Docker 内可能存在不准确的情况,不影响使用。 **PaddlePaddle 安装验证** 首先运行如下命令确保 PaddlePaddle 正确安装 ```shell python -c "import paddle; paddle.utils.run_check()" ``` 预期会有如下输出 ```shell Running verify PaddlePaddle program ... W0720 09:29:22.035640 12791 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2 W0720 09:29:22.040702 12791 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1. PaddlePaddle works well on 1 GPU. W0720 09:29:36.763486 12791 fuse_all_reduce_op_pass.cc:79] Find all_reduce operators: 2. To make the speed faster, some all_reduce ops are fused during training, after fusion, the number of all_reduce ops is 2. PaddlePaddle works well on 8 GPUs. PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. 
``` 表示 PaddlePaddle 已经正确安装。 如果出现以下错误信息请确保 CUDA 安装正确且已根据 CUDA 安装路径正确配置的 LD_LIBRARY_PATH。 例如执行命令添加 `export LD_LIBRARY_PATH=/usr/lib64/:/usr/local/lib/:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}` 。 具体请参考[文档](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)。 ``` You are using GPU version Paddle, but your CUDA device is not set properly. ``` ### 2. 分布式环境验证 如果单机运行正常,但多机分布式运行异常请先根据 [网络问题排查](#31-网络问题排查) 部分排查网络问题再进行以下排查。 请先确保**各个机器**的 PaddlePaddle 环境已经正确安装,然后在等待验证的其中一个节点上运行如下命令 ```shell python -m paddle.distributed.launch run_check ``` > 默认验证 2 机分布式环境,如果需要验证更多机器(例如4个)环境下飞桨分布式是否运行正常,请添加节点数参数 --nnodes,具体命令如下: > > `python -m paddle.distributed.launch --nnodes=4 run_check` 预期输出如下 ```shell LAUNCH INFO 2022-07-20 09:38:33,349 PaddlePaddle Distributed Check begin... LAUNCH INFO 2022-07-20 09:38:33,358 ----------- Configuration ---------------------- LAUNCH INFO 2022-07-20 09:38:33,358 devices: None LAUNCH INFO 2022-07-20 09:38:33,358 elastic_level: -1 LAUNCH INFO 2022-07-20 09:38:33,358 elastic_timeout: 30 LAUNCH INFO 2022-07-20 09:38:33,358 gloo_port: 6767 LAUNCH INFO 2022-07-20 09:38:33,358 host: None LAUNCH INFO 2022-07-20 09:38:33,358 job_id: default LAUNCH INFO 2022-07-20 09:38:33,358 legacy: False LAUNCH INFO 2022-07-20 09:38:33,358 log_dir: log LAUNCH INFO 2022-07-20 09:38:33,358 log_level: ERROR LAUNCH INFO 2022-07-20 09:38:33,358 master: None LAUNCH INFO 2022-07-20 09:38:33,358 max_restart: 3 LAUNCH INFO 2022-07-20 09:38:33,358 nnodes: 2 LAUNCH INFO 2022-07-20 09:38:33,358 nproc_per_node: None LAUNCH INFO 2022-07-20 09:38:33,358 rank: -1 LAUNCH INFO 2022-07-20 09:38:33,358 run_mode: collective LAUNCH INFO 2022-07-20 09:38:33,359 server_num: None LAUNCH INFO 2022-07-20 09:38:33,359 servers: LAUNCH INFO 2022-07-20 09:38:33,359 trainer_num: None LAUNCH INFO 2022-07-20 09:38:33,359 trainers: LAUNCH INFO 2022-07-20 09:38:33,359 training_script: /usr/local/lib/python3.7/dist-packages/paddle/distributed/launch/plugins/test.py LAUNCH INFO 2022-07-20 09:38:33,359 training_script_args: [] LAUNCH INFO 2022-07-20 09:38:33,359 with_gloo: 1 LAUNCH INFO 2022-07-20 09:38:33,359 -------------------------------------------------- LAUNCH INFO 2022-07-20 09:38:33,360 Job: default, mode collective, replicas 2[2:2], elastic False LAUNCH INFO 2022-07-20 09:38:33,367 Waiting peer start... Copy the following command to other nodes to run. -------------------------------------------------------------------------------- python -m paddle.distributed.launch --master 10.10.1.1:49178 run_check -------------------------------------------------------------------------------- ``` > 如果当前安装的 PaddlePaddle 中未包含该工具,请根据上节提示安装 develop 版本进行测试。 根据提示,复制最后的命令(复制机器上个命令的执行结果,以下命令为示例),在其他节点上粘贴执行 ```shell python -m paddle.distributed.launch --master 10.10.1.1:49178 run_check ``` 执行后,如果配置正常则每个节点都会有后续输出 ```shell LAUNCH INFO 2022-07-20 09:46:41,571 Run Pod: xqqbsr, replicas 2, status ready LAUNCH INFO 2022-07-20 09:46:41,601 Watching Pod: xqqbsr, replicas 2, status running Prepare distributed training with 2 nodes 2 cards I0720 09:46:43.583846 13375 tcp_utils.cc:181] The server starts to listen on IP_ANY:14863 I0720 09:46:43.584153 13375 tcp_utils.cc:130] Successfully connected to 10.10.10.1:14863 W0720 09:46:47.089151 13375 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2 W0720 09:46:47.098454 13375 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1. 
2022-07-20 09:46:51,333-INFO: [topology.py:187:__init__] HybridParallelInfo: rank_id: 0, mp_degree: 1, sharding_degree: 1, pp_degree: 1, dp_degree: 4, mp_group: [0], sharding_group: [0], pp_group: [0], dp_group: [0, 1, 2, 3], check/clip group: [0] Distributed training start... [Epoch 0, batch 0] loss: 5.10316, acc1: 0.03125, acc5: 0.06250 Distributed training completed I0720 09:46:54.828758 13432 tcp_store.cc:257] receive shutdown event and so quit from MasterDaemon run loop LAUNCH INFO 2022-07-20 09:46:56,617 Pod completed LAUNCH INFO 2022-07-20 09:46:57,085 Exit code 0 ``` 则表示分布式环境配置正常,多机分布式训练可以成功运行。 > 如果其他节点执行命令后各个节点没有后续输出或输出不符合预期请参考 [FAQ](#3-faq) 部分解决。 **实际分布式训练任务验证** 在启动分布式任务前需要确保各个节点上安装好 PaddlePaddle 环境,同步好数据和代码。 例如准备好训练代码 `train.py`,同步至每个训练节点的工作目录。 ```python import numpy as np import paddle from paddle.distributed import fleet from paddle.vision.models import ResNet from paddle.vision.models.resnet import BottleneckBlock from paddle.io import Dataset, BatchSampler, DataLoader base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 epoch = 10 batch_num = 3 batch_size = 32 class_dim = 102 class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples def __getitem__(self, idx): image = np.random.random([3, 224, 224]).astype('float32') label = np.random.randint(0, class_dim - 1, (1, )).astype('int64') return image, label def __len__(self): return self.num_samples def optimizer_setting(parameter_list=None): optimizer = paddle.optimizer.Momentum( learning_rate=base_lr, momentum=momentum_rate, weight_decay=paddle.regularizer.L2Decay(l2_decay), parameters=parameter_list) return optimizer def train_resnet(): fleet.init(is_collective=True) resnet = ResNet(BottleneckBlock, 18, num_classes=class_dim) optimizer = optimizer_setting(parameter_list=resnet.parameters()) optimizer = fleet.distributed_optimizer(optimizer) resnet = fleet.distributed_model(resnet) dataset = RandomDataset(batch_num * batch_size) train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=2) for eop in range(epoch): resnet.train() for batch_id, data in enumerate(train_loader()): img, label = data label.stop_gradient = True out = resnet(img) loss = paddle.nn.functional.cross_entropy(input=out, label=label) avg_loss = paddle.mean(x=loss) acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) avg_loss.backward() optimizer.step() resnet.clear_gradients() print("[Epoch %d, batch %d] loss: %.5f, acc1: %.5f, acc5: %.5f" % (eop, batch_id, avg_loss, acc_top1, acc_top5)) if __name__ == '__main__': train_resnet() ``` 启动分布式训练的命令如下, 这个命令需要在每个参与训练的节点上执行(每个节点上的 `--master`都设置为同一个),如节点较多可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。 ```python python -m paddle.distributed.launch --master=10.10.1.1:49178 --nnodes=2 train.py ``` 这里用到了分布式启动最重要的两个参数 - `--nnodes` 为分布式任务的节点个数(一般为参与任务的机器数量),默认为 1 即启动单机任务,也可使用环境变量 PADDLE_NNODES 设置。 - `--master` 为分布式信息同步的主节点地址,ip:port 格式,可以由第一个启动的节点自动打印或者直接由用户设置为参与任务的任意节点 ip 和任意可用端口,也可使用环境变量 PADDLE_MASTER 设置。 > master 支持使用 etcd 服务,当使用 etcd 服务时,需要同时指定任务 id 以避免任务间冲突。具体地,可以通过 --job_id 参数或者设置环境变量 PADDLE_JOB_ID 指定任务id。 启动后,将看到如下日志,首先是配置部分 ```shell LAUNCH INFO 2022-07-20 12:10:15,863 ----------- Configuration ---------------------- LAUNCH INFO 2022-07-20 12:10:15,863 devices: None LAUNCH INFO 2022-07-20 12:10:15,863 elastic_level: -1 LAUNCH INFO 2022-07-20 12:10:15,863 elastic_timeout: 30 LAUNCH INFO 2022-07-20 12:10:15,863 gloo_port: 6767 LAUNCH INFO 2022-07-20 12:10:15,863 host: None LAUNCH 
INFO 2022-07-20 12:10:15,863 job_id: default LAUNCH INFO 2022-07-20 12:10:15,863 legacy: False LAUNCH INFO 2022-07-20 12:10:15,863 log_dir: log LAUNCH INFO 2022-07-20 12:10:15,863 log_level: INFO LAUNCH INFO 2022-07-20 12:10:15,863 master: 127.0.0.1:8890 LAUNCH INFO 2022-07-20 12:10:15,863 max_restart: 3 LAUNCH INFO 2022-07-20 12:10:15,863 nnodes: 2 LAUNCH INFO 2022-07-20 12:10:15,863 nproc_per_node: None LAUNCH INFO 2022-07-20 12:10:15,863 rank: -1 LAUNCH INFO 2022-07-20 12:10:15,863 run_mode: collective LAUNCH INFO 2022-07-20 12:10:15,863 server_num: None LAUNCH INFO 2022-07-20 12:10:15,863 servers: LAUNCH INFO 2022-07-20 12:10:15,863 trainer_num: None LAUNCH INFO 2022-07-20 12:10:15,863 trainers: LAUNCH INFO 2022-07-20 12:10:15,863 training_script: train.py LAUNCH INFO 2022-07-20 12:10:15,863 training_script_args: [] LAUNCH INFO 2022-07-20 12:10:15,864 with_gloo: 1 LAUNCH INFO 2022-07-20 12:10:15,864 -------------------------------------------------- ``` 这里打印分布式启动时的配置信息, 更多 launch 启动参数和用法请参考 [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/launch_cn.html) 或通过以下命令获得。 ```shell python -m paddle.distributed.launch --help ``` 然后打印的是任务启动相关的信息: ```shell LAUNCH INFO 2022-07-20 12:10:15,864 Job: default, mode collective, replicas 2[2:2], elastic False LAUNCH INFO 2022-07-20 12:10:15,870 Waiting peer start... LAUNCH INFO 2022-07-20 12:10:25,860 Run Pod: bpdjev, replicas 2, status ready LAUNCH INFO 2022-07-20 12:10:25,883 Watching Pod: bpdjev, replicas 2, status running ``` 其中,每行对应的具体含义解释如下: * 因为未设置 job_id,使用默认名称 default,启动的是 collective 模式,总共 2 个节点的分布式任务,不支持弹性(即节点数不可变)。 * 节点短暂处于等待其他节点启动的状态,如果其他节点已启动但日志长期处于等待状态,请根据 [FAQ](#31-网络问题排查) 进行排查。 * 任务准备启动,当前节点名为 bpdjev(该名称为随机生成)处于 ready 状态,当前节点包含 2 个进程(1 个进程对应 1 个 GPU)。 * 节点已启动,正在监控进程健康状态。 至此分布式启动成功,接下来打印业务日志(即用户代码相关输出日志) ```shell I0720 12:10:27.763713 14071 tcp_utils.cc:181] The server starts to listen on IP_ANY:11061 I0720 12:10:27.763914 14071 tcp_utils.cc:130] Successfully connected to 10.10.10.1:11061 W0720 12:10:30.666985 14071 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2 W0720 12:10:30.675815 14071 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1. 2022-07-20 12:10:36,377-INFO: [topology.py:187:**init**] HybridParallelInfo: rank_id: 0, mp_degree: 1, sharding_degree: 1, pp_degree: 1, dp_degree: 4, mp_group: [0], sharding_group: [0], pp_group: [0], dp_group: [0, 1, 2, 3], check/clip group: [0] /usr/local/lib/python3.7/dist-packages/paddle/nn/layer/norm.py:668: UserWarning: When training, we now always track global mean and variance. 
"When training, we now always track global mean and variance.") [Epoch 0, batch 0] loss: 5.42939, acc1: 0.00000, acc5: 0.00000 [Epoch 0, batch 1] loss: 6.13338, acc1: 0.00000, acc5: 0.03125 [Epoch 0, batch 2] loss: 7.25566, acc1: 0.03125, acc5: 0.06250 // 此处省略多行类似日志 [Epoch 9, batch 0] loss: 7.23511, acc1: 0.00000, acc5: 0.00000 [Epoch 9, batch 1] loss: 4.69053, acc1: 0.03125, acc5: 0.06250 [Epoch 9, batch 2] loss: 5.08652, acc1: 0.00000, acc5: 0.03125 I0720 12:10:53.647085 14112 tcp_store.cc:257] receive shutdown event and so quit from MasterDaemon run loop ``` 至此,训练结束,业务代码结束,最后打印退出日志 ```shell LAUNCH INFO 2022-07-20 12:10:56,915 Pod completed LAUNCH INFO 2022-07-20 12:10:57,388 Exit code 0 ``` 更多日志请在 log 目录下查看,日志文件命名为` {job_id}.{节点名}.{卡号}.log` , 例如如下两个文件为本例子中 2 张卡分别对应的日志。 ```shell -rw-r--r-- 1 root root 2.9K Jul 20 12:10 default.bpdjev.0.log -rw-r--r-- 1 root root 2.7K Jul 20 12:10 default.bpdjev.1.log ``` 当有错误发生时,比如 GPU 卡被占用发生冲突时,会有如下输出 ```shell LAUNCH INFO 2022-07-21 11:58:59,451 Pod failed LAUNCH ERROR 2022-07-21 11:58:59,452 Container failed !!! Container rank 6 status failed cmd ['/usr/bin/python', '-u', 'train.py'] code 1 log log/default.fxemxd.6.log env {'GREP_COLOR': '1;31', 'CUDNN_VERSION': '8.1.1.33', 'LC_ALL': 'en_US.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/lib/python3.7/dist-packages/cv2/../../lib64:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'LANG': 'en_US.UTF-8', 'HOSTNAME': 'xxxxx', 'OLDPWD': '/home/userhome', 'WITH_GPU': 'ON', 'NVIDIA_VISIBLE_DEVICES': 'all', 'NCCL_VERSION': '2.8.4', 'GOPATH': '/root/gopath', 'PWD': '/home/userhome/workspace/Paddle', 'HOME': '/home/userhome', 'GOROOT': '/usr/local/go', 'CLICOLOR': '1', 'DEBIAN_FRONTEND': 'noninteractive', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'TERM': 'xterm', 'WITH_AVX': 'ON', 'CUDA_VERSION': '11.2.1', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'SHLVL': '1', 'LANGUAGE': 'en_US.UTF-8', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450,driver<451', 'PATH': '/home/cmake-3.16.0-Linux-x86_64/bin:/usr/local/gcc-8.2/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin:/root/gopath/bin:/home/userhome/.fzf/bin', 'PS1': '\\[\\033[1;33m\\]kui \\[\\033[1;37m\\]\\h \\[\\033[1;32m\\]\\w\\[\\033[1;33m\\]$(__git_ps1 " \\[\\033[35m\\]{\\[\\033[36m\\]%s\\[\\033[35m\\]}")\\[\\033[0m\\] ', '_': '/usr/bin/python', 'CUSTOM_DEVICE_ROOT': '', 'OMP_NUM_THREADS': '1', 'QT_QPA_PLATFORM_PLUGIN_PATH': '/usr/local/lib/python3.7/dist-packages/cv2/qt/plugins', 'QT_QPA_FONTDIR': '/usr/local/lib/python3.7/dist-packages/cv2/qt/fonts', 'runtime_include_dir': '/usr/local/lib/python3.7/dist-packages/paddle/libs', 'POD_NAME': 'fxemxd', 'PADDLE_MASTER': '10.10.10.1:60216', 'PADDLE_GLOBAL_SIZE': '10', 'PADDLE_LOCAL_SIZE': '8', 'PADDLE_GLOBAL_RANK': '8', 'PADDLE_LOCAL_RANK': '6', 'PADDLE_NNODES': '2', 'PADDLE_TRAINER_ENDPOINTS': '10.10.10.1:49825,10.10.10.1:18781,10.10.10.1:53546,10.10.10.1:30837,10.10.10.1:11249,10.10.10.1:13092,10.10.10.1:11398,10.10.10.1:21309,10.10.10.1:47065,10.10.10.1:14834', 'PADDLE_CURRENT_ENDPOINT': '10.10.10.1:47065', 'PADDLE_TRAINER_ID': '8', 'PADDLE_TRAINERS_NUM': '10', 'PADDLE_RANK_IN_NODE': '6', 'FLAGS_selected_gpus': '6'} I0721 11:58:51.079766 29676 tcp_utils.cc:130] Successfully connected to 10.10.10.1:60216 W0721 11:58:54.582710 29676 gpu_resources.cc:61] Please NOTE: device: 6, GPU Compute Capability: 7.0, Driver API Version: 11.2, 
Runtime API Version: 11.2 W0721 11:58:54.590724 29676 gpu_resources.cc:91] device: 6, cuDNN Version: 8.1. Traceback (most recent call last): File "train.py", line 75, in train_resnet() File "train.py", line 39, in train_resnet fleet.init(is_collective=True) File "/usr/local/lib/python3.7/dist-packages/paddle/distributed/fleet/base/fleet_base.py", line 319, in init paddle.distributed.init_parallel_env() File "/usr/local/lib/python3.7/dist-packages/paddle/distributed/parallel.py", line 264, in init_parallel_env paddle.distributed.barrier(group=group) File "/usr/local/lib/python3.7/dist-packages/paddle/distributed/collective.py", line 334, in barrier task = group.process_group.barrier() OSError: (External) NCCL error(5), invalid usage. [Hint: 'ncclInvalidUsage'. The call to NCCL is incorrect. This is usually reflecting a programming error.] (at /paddle/Paddle/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc:214) LAUNCH INFO 2022-07-21 11:59:00,655 Exit code -15 ``` 这当中主要包含以下信息: * 发生错误的提示 Pod failed 和 Container failed !!!. * 错误的卡号(Container rank 6),错误命令和错误环境的环境变量。 * 具体的错误信息 trace,该部分取决于业务代码错误内容。 * 最后打印错误退出码 Exit code -15. 请根据报错信息进行排查,部分错误请参考 [FAQ](#3-faq)。 ### 3. FAQ #### 3.1 网络问题排查 请按照以下步骤排查网络问题 **获取节点IP** 使用命令 `hostname -i` 查看机器 ip,多网卡环境使用 `ifconfig` 命令查看(见上节),获得 IP。 ```shell $ hostname -i 10.10.10.1 ``` 如果这里得到的IP非预期使用的IP或者和日志中打印的IP不相符时,请根据后面小节排查是否是多网卡环境导致使用的网卡不一致。 **确认节点间是否能通过ping连接** 这里举例获得 ip 为 10.10.10.1,在其他节点上使用 `ping 10.10.10.1` 测试机器间是否能连接,有如下输出即为连接成功 ```shell $ ping 10.10.10.1 PING 10.10.10.1 (10.10.10.1) 56(84) bytes of data. 64 bytes from 10.10.10.1: icmp_seq=1 ttl=61 time=0.089 ms 64 bytes from 10.10.10.1: icmp_seq=2 ttl=61 time=0.057 ms 64 bytes from 10.10.10.1: icmp_seq=3 ttl=61 time=0.059 ms 64 bytes from 10.10.10.1: icmp_seq=4 ttl=61 time=0.078 ms 64 bytes from 10.10.10.1: icmp_seq=5 ttl=61 time=0.055 ms ^C --- 10.10.10.1 ping statistics --- 5 packets transmitted, 5 received, 0% packet loss, time 4053ms rtt min/avg/max/mdev = 0.055/0.067/0.089/0.016 ms ``` 长时间无输出或其他输出即无法连接,请联系机器网络管理员处理。 **确认节点间是否能通过HTTP/TCP连接** 在机器 `10.10.10.1`上运行命令 `python -m http.server 8090` 启动 http 服务, ```shell $ python -m http.server 8090 Serving HTTP on 0.0.0.0 port 8090 (http://0.0.0.0:8090/) ... ``` 如果提示端口被占用请使用其他可用端口启动服务,然后在其他的机器上运行命令 `curl 10.10.10.1:8090` ```shell $ curl 10.10.10.1:8090 Directory listing for /
  • train.py
  • ``` 有类似以上输出则说明连接成功,否则两台机器间网络可能存在问题,尝试其他端口仍有问题需要联系网络管理员处理。 **确认NCCL是否运行正常** 首先,设置环境变量NCCL_DEBUG,查看NCCL版本和当前使用的IP ```shell export NCCL_DEBUG=INFO python -m paddle.distributed.launch train.py ``` 在输出日志中找到 NCCL 版本信息 ```shell NCCL version 2.8.4+cuda11.2 ``` 确认各个节点的 NCCL 版本相同且高于 2.8。 以及在输出的信息中查找如下信息 ```shell [0] NCCL INFO NET/Socket : Using [0]eth0:10.10.10.1<0> [1] ``` 表示 nccl 使用了名为 `eth0` ip 为 10.10.10.1 的网卡,如果需要使用其他网卡,需要在运行命令前添加环境变量 ```shell export NCCL_SOCKET_IFNAME=eth1 ``` 注意这里添加的时网卡名不是 ip,对应关系参照 `ifconfig` 的输出。 上述测试均正常但是无法跑通分布式环境测试时 请使用 [nccl-test](https://github.com/NVIDIA/nccl-tests) 测试 GPU 通信是否正常。 #### 3.2 多Python环境问题 当工作环境中存在多个版本的 python 时可能存在不一致导致问题。 检查 python 版本 ```shell $ python --version Python 3.7.12 ``` 检查 python 安装目录 ```shell $ which python /usr/bin/python ``` 直接调用绝对路径验证版本 ```shell $ /usr/bin/python --version Python 3.7.12 ``` 如果两次打印的版本不匹配,可以通过使用绝对路径的方式解决。 获取绝对路径需要知道需要安装目录,默认环境中可以通过以下命令查看安装的版本。 ```shell $ ls /usr/bin/python* /usr/bin/python /usr/bin/python2.7 /usr/bin/python3.6 /usr/bin/python3.7 ``` 即当使用 python 时,使用绝对路径 `/usr/bin/python3.7` 替换。 #### 3.3 自动获取 IP 错误(多网卡环境问题) 使用 paddle.distributed.launch 会自动识别使用的 IP,在多网卡配置的环境中自动识别的网卡可能不是预期使用的网卡。 首先可以通过 `ifconfig` 命令查看机器的网卡配置情况,例如 ```shell docker0: flags=4163 mtu 1500 inet 10.0.3.1 netmask 255.255.255.0 broadcast 0.0.0.0 inet6 fe80::7050:1cff:fea2:14f3 prefixlen 64 scopeid 0x20 ether 1e:a6:0d:0d:3b:1e txqueuelen 1000 (Ethernet) RX packets 27201548 bytes 12176726229 (11.3 GiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 26762571 bytes 48666409371 (45.3 GiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 lo: flags=73 mtu 65536 inet 127.0.0.1 netmask 255.0.0.0 inet6 ::1 prefixlen 128 scopeid 0x10 loop txqueuelen 1000 (Local Loopback) RX packets 1321339447 bytes 1047567817083 (975.6 GiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 1321339447 bytes 1047567817083 (975.6 GiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 eth0: flags=4163 mtu 1500 inet 10.10.10.1 netmask 255.255.255.192 broadcast 10.127.4.191 inet6 f080::5200:4bff:f030:2090 prefixlen 64 scopeid 0x20 ether 50:6b:4b:31:2a:90 txqueuelen 1000 (Ethernet) RX packets 32040749852 bytes 43394575453133 (39.4 TiB) RX errors 0 dropped 391107 overruns 0 frame 0 TX packets 24330967394 bytes 30441950099144 (27.6 TiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 ``` 结果中虽然有3项甚至更多但这里只有一张 ip 为 `10.10.10.1` 网卡(inet值),docker0 为 Docker 虚拟网卡, lo 为本地回路,都不需要关注。 当启动分布式训练命令时,如果飞桨自动识别出的网卡IP不正确时,可以使用--host参数手动配置IP,如 ```python python -m paddle.distributed.launch --master=10.10.10.1:49178 --nnodes=2 --host=10.10.10.1 train.py ``` > 当 --master 地址识别错误时,也需要手动替换。 #### 3.4 机器端口有限制,需要使用固定端口 当集群环境限制通信网卡时需要手动配置所有 ip 和 port 以启动分布式,以机器 `10.10.10.1` 和机器 `10.10.10.2` 必须使用端口 8000-8999 的情况为例, 假设每台机器有两个卡,使用如下脚本设置每个卡对应进程的环境变量,依次启动进程。 ```shell # 所有卡 ip port 列表, ip1:port1,ip2:port2 export PADDLE_TRAINER_ENDPOINTS=10.10.10.1:8000,10.10.10.1:8001,10.10.10.2:8000,10.10.10.2:8001 # 所有卡数 export PADDLE_TRAINERS_NUM=4 # 当前卡 ip:port export PADDLE_CURRENT_ENDPOINT=10.10.10.1:8000 # 当前卡序号 export PADDLE_TRAINER_ID=0 # 当前卡在节点内序号 export PADDLE_RANK_IN_NODE=0 # 当前卡使用的 GPU 卡号 export FLAGS_selected_gpus=0 # 注意,这里不再使用 launch 启动,但本脚本需要运行多次 python train.py ``` 注意在执行时,需要依次替换后面4个环境变量为对应值启动。 #### 3.5 常用的通信问题排查 GPU/NCCL 问题请先核对**版本是否匹配**,通过 `nvidia-smi` 查看是否有进程正在占用,仍有问题需要通过 [nccl-test](https://github.com/NVIDIA/nccl-tests) 测试。常见运行时错误和解决方法如下, **NCCL error(5)** ```shell OSError: (External) NCCL error(5), invalid usage. [Hint: 'ncclInvalidUsage'. The call to NCCL is incorrect. 
This is usually reflecting a programming error.] ``` 原因和解决方法:该错误多为同一张 GPU 卡被多个进程同时使用导致冲突,请检查正在使用 GPU 的进程。如果需要在同一台机器上启动多个逻辑节点,可以使用 `CUDA_VISIBLE_DEVICES` 环境变量控制设备可见性。 **NCCL error(2)** ```shell ExternalError: Nccl error(2), unhandled system error ``` 原因和解决方法:该错误一般为 shm 设置太小,如果使用 Docker 环境需要在启动 Docker 时做映射和设置如 `--shm-size 32G`. ================================================ FILE: docs/docker_install.md ================================================ ## Docker 环境安装 使用 Docker 首先需要安装 Docker 环境,安装的完整流程请参考[文档](https://docs.docker.com/engine/install/),基础安装流程如下所述。 另外在 Docker 中使用 GPU 还需要安装 [nvida-container-runtime](https://github.com/NVIDIA/nvidia-container-runtime)。 **Ubuntu** 添加 apt 源。 ``` sudo curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add - sudo add-apt-repository "deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" ``` 软件源升级, 安装docker ``` sudo apt-get update sudo apt-get docker-ce docker-ce-cli containerd.io ``` 使用 `docker version` 查看 docker 版本信息无错误信息即说明安装运行正常。 安装 nvida-container-runtime ``` sudo apt-get install nvidia-container-runtimeb ``` **CentOS** 添加yum源。 ``` sudo wget -O /etc/yum.repos.d/docker-ce.repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo ``` 安装组件。 ``` sudo yum install docker-ce docker-ce-cli containerd.io ``` 启动Docker。 ``` sudo systemctl start docker ``` 查看Docker状态。 ``` sudo systemctl status docker ``` 如日志状态为 active (running) 则表示docker启动正常。 ``` ● docker.service - LSB: start and stop docker Loaded: loaded (/etc/rc.d/init.d/docker; bad; vendor preset: disabled) Active: active (running) since Thu 2022-08-11 20:11:19 CST; 3 days ago Docs: man:systemd-sysv-generator(8) Process: 29766 ExecStop=/etc/rc.d/init.d/docker stop (code=exited, status=0/SUCCESS) Process: 33215 ExecStart=/etc/rc.d/init.d/docker start (code=exited, status=0/SUCCESS) ``` 安装 nvida-container-runtime。 ``` sudo yum install nvidia-container-runtime ``` ================================================ FILE: docs/quick_start.md ================================================ # 快速开始 ## 1. 
环境准备 这里介绍使用裸机或者 Docker 环境使用 PaddleFleetX 的方法,用户根据具体情况选择一种安装部署方式即可。 使用多机训练时,需要在每台机器上都部署相应的环境。 ### 1.1 Docker 环境部署 推荐使用 Docker 安装部署 PaddleFleetX 进行大模型训练,Docker 环境的安装可以参考[文档](docker_install.md)。 请根据本地 CUDA 版本(使用 `nvidia-smi`命令查看)使用以下命令拉取对应或兼容的镜像, ``` docker pull registry.baidubce.com/ppfleetx/fleetx-cuda11.2-cudnn8:dev ``` 如本地环境cuda版本较低可以参考 Dockerfile 根据需要定制镜像。 大模型训练需要使用GPU,如已安装 nvida-container-runtime 可以使用以下命令运行镜像, ``` docker run -it --name=paddle --net=host -v /dev/shm:/dev/shm --shm-size=32G -v $PWD:/paddle --runtime=nvidia registry.baidubce.com/ppfleetx/ppfleetx-cuda11.2-cudnn8:v0.1.0 bash ``` 未安装 nvida-container-runtime 或启动后无法执行 `nvidia-smi` 查看GPU信息时可以尝试通过如下脚本启动运行, ```shell export CUDA_SO="$(\ls /usr/lib64/libcuda* | grep -v : | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | grep -v : | xargs -I{} echo '-v {}:{}')" export DEVICES=$(find /dev/nvidia* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}') nvsmi=`which nvidia-smi` docker run \ ${CUDA_SO} ${DEVICES} \ -v /dev/shm:/dev/shm \ -v $PWD:/paddle \ --name paddle \ --net=host \ --shm-size=32G \ -v $nvsmi:$nvsmi \ -it \ registry.baidubce.com/ppfleetx/ppfleetx-cuda11.2-cudnn8:v0.1.0 \ bash ``` 以上命令 `-v $PWD:/paddle` 将当前目录映射到 /paddle 目录,在 docker 环境内部对该目录的更改将会持久化。 > 为保证通信效率和通信正常,添加参数 --net=host 使用主机网络,更多 docker run 参数说明请参考 [docker 文档](https://docs.docker.com/engine/reference/commandline/run/)。 ### 1.2 裸机部署 **安装 PaddlePaddle** 首先根据环境在 [安装文档](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html) 选择对应的版本使用 pip install 执行对应命令安装 PaddlePaddle. **请务必按照文档安装 GPU 版本且验证安装成功**。 例如使用如下命令将会安装基于 CUDA 11.2 最新版本的 PaddlePaddle. ```shell python -m pip install paddlepaddle-gpu==0.0.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` 安装遇到问题以及环境验证的方法也可以参考[文档](deployment_faq.md#1-单机环境验证)。 **安装依赖** 使用以下命令安装 PaddleFleetX 运行所需依赖。 ```shell python -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ## 2. 模型训练 进入环境后首先使用以下命令拉取最新代码 ```shell git clone https://github.com/PaddlePaddle/PaddleFleetX.git ``` 然后根据需求选择对应的训练方式。 ### 2.1. 
单卡训练 切换工作目录并下载demo数据, ``` mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ``` 然后使用以下命令运行程序, ```shell python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单卡训练,可将对应yaml文件中的Model-hidden size值改为原来的1/2即可。 **运行日志** ``` [2022-09-21 05:42:26,980] [ INFO] - [train] epoch: 0, batch: 0, loss: 10.999595642, avg_batch_cost: 2.73014 sec, speed: 0.37 step/s, ips_total: 3001 tokens/s, ips: 3001 tokens/s, learning rate: 2.77778e-08 [2022-09-21 05:42:27,492] [ INFO] - [train] epoch: 0, batch: 1, loss: 10.997043610, avg_batch_cost: 0.51164 sec, speed: 1.95 step/s, ips_total: 16011 tokens/s, ips: 16011 tokens/s, learning rate: 4.16667e-08 [2022-09-21 05:42:27,997] [ INFO] - [train] epoch: 0, batch: 2, loss: 10.994422913, avg_batch_cost: 0.50457 sec, speed: 1.98 step/s, ips_total: 16236 tokens/s, ips: 16236 tokens/s, learning rate: 5.55556e-08 [2022-09-21 05:42:28,503] [ INFO] - [train] epoch: 0, batch: 3, loss: 11.005314827, avg_batch_cost: 0.50497 sec, speed: 1.98 step/s, ips_total: 16223 tokens/s, ips: 16223 tokens/s, learning rate: 6.94444e-08 [2022-09-21 05:42:29,009] [ INFO] - [train] epoch: 0, batch: 4, loss: 10.988020897, avg_batch_cost: 0.50480 sec, speed: 1.98 step/s, ips_total: 16228 tokens/s, ips: 16228 tokens/s, learning rate: 8.33333e-08 [2022-09-21 05:42:29,513] [ INFO] - [train] epoch: 0, batch: 5, loss: 10.983006477, avg_batch_cost: 0.50393 sec, speed: 1.98 step/s, ips_total: 16256 tokens/s, ips: 16256 tokens/s, learning rate: 9.72222e-08 [2022-09-21 05:42:30,018] [ INFO] - [train] epoch: 0, batch: 6, loss: 10.988539696, avg_batch_cost: 0.50427 sec, speed: 1.98 step/s, ips_total: 16245 tokens/s, ips: 16245 tokens/s, learning rate: 1.11111e-07 ``` ### 2.2. 
单机多卡训练 切换工作目录并下载demo数据, ```shell mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ``` 然后使用以下命令运行单机多卡程序, ``` python -m paddle.distributed.launch \ ./tools/train.py \ -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的环境例如 16G 显存下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ``` python -m paddle.distributed.launch \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml -o Model.hidden_size=1024 ``` > 更多 launch 启动参数和用法请参考 [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/launch_cn.html)。 成功则开始训练过程, ``` LAUNCH INFO 2022-08-15 07:37:38,946 ----------- Configuration ---------------------- LAUNCH INFO 2022-08-15 07:37:38,946 devices: None LAUNCH INFO 2022-08-15 07:37:38,947 elastic_level: -1 LAUNCH INFO 2022-08-15 07:37:38,947 elastic_timeout: 30 LAUNCH INFO 2022-08-15 07:37:38,947 gloo_port: 6767 LAUNCH INFO 2022-08-15 07:37:38,947 host: None LAUNCH INFO 2022-08-15 07:37:38,947 ips: None LAUNCH INFO 2022-08-15 07:37:38,947 job_id: default LAUNCH INFO 2022-08-15 07:37:38,947 legacy: False LAUNCH INFO 2022-08-15 07:37:38,947 log_dir: log LAUNCH INFO 2022-08-15 07:37:38,947 log_level: INFO LAUNCH INFO 2022-08-15 07:37:38,947 master: None LAUNCH INFO 2022-08-15 07:37:38,947 max_restart: 3 LAUNCH INFO 2022-08-15 07:37:38,947 nnodes: 1 LAUNCH INFO 2022-08-15 07:37:38,947 nproc_per_node: None LAUNCH INFO 2022-08-15 07:37:38,947 rank: -1 LAUNCH INFO 2022-08-15 07:37:38,947 run_mode: collective LAUNCH INFO 2022-08-15 07:37:38,947 server_num: None LAUNCH INFO 2022-08-15 07:37:38,947 servers: LAUNCH INFO 2022-08-15 07:37:38,947 start_port: 6070 LAUNCH INFO 2022-08-15 07:37:38,947 trainer_num: None LAUNCH INFO 2022-08-15 07:37:38,947 trainers: LAUNCH INFO 2022-08-15 07:37:38,947 training_script: run_pretrain.py LAUNCH INFO 2022-08-15 07:37:38,947 training_script_args: ['-c', './configs_1.3B_dp8.yaml'] LAUNCH INFO 2022-08-15 07:37:38,947 with_gloo: 1 LAUNCH INFO 2022-08-15 07:37:38,947 -------------------------------------------------- LAUNCH INFO 2022-08-15 07:37:38,948 Job: default, mode collective, replicas 1[1:1], elastic False LAUNCH INFO 2022-08-15 07:37:38,949 Run Pod: vqhbut, replicas 8, status ready LAUNCH INFO 2022-08-15 07:37:39,063 Watching Pod: vqhbut, replicas 8, status running ## 启动配置 [2022-08-15 07:41:23,063] [ INFO] - [train] epoch: 0, batch: 0, loss: 11.255846024, avg_batch_cost: 7.06713 sec, speed: 0.14 step/s, ips_total: 9273 tokens/s, ips: 1159 tokens/s, learning rate: 2.77778e-08 ## 更多训练日志 ``` 如有启动异常请根据[文档](deployment_faq.md#1-单机环境验证)进行工作环境验证,其他问题可参考[FAQ](deployment_faq.md#3-faq)解决。 ## 2.3. 
多机多卡训练 使用以下命令进行多机分布式训练,其中 --nnodes 参数为分布式训练机器数量,--master 为训练机器中其中一台机器的IP,运行时需要将命令中示例IP替换为真实的机器IP和任意可用端口,然后在**每个节点**上都运行以下命令, 如果不知道机器IP可以不设置--master参数先在一台机器上启动,然后根据提示复制命令在其他机器上启动即可。 ``` python -m paddle.distributed.launch --master=10.10.10.1:8099 --nnodes=2 \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml ``` > 该示例为16卡任务,需要满足总卡数为16的要求。 > 注意这里需要使用单机多卡训练部分的代码和数据。 成功则开始多机训练过程,日志和单机多卡类似,日志异常时请按照[文档](deployment_faq.md#2-分布式环境验证)进行环境验证和问题排查。 若要在显存容量更小的环境例如 16G 显存下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ``` python -m paddle.distributed.launch --master=10.10.10.1:8099 --nnodes=2 \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml -o Model.hidden_size=2048 ``` 更多大模型多机训练内容可见[文档](../projects/gpt/docs/README.md)。 ================================================ FILE: docs/standard.md ================================================ ## 模型接入规范 本文讲述在PaddleFleetX repo接入一个新模型,该如何添加和修改文件,以及相应的规范化流程。 ### 1.PaddleFleetX 介绍 PaddleFleetX是飞桨大模型训练推理一站式工具组件。与Paddle.distributed、Paddle.fleet API的关系如下:
（图：PaddleFleetX 与 Paddle 的关系）
    目前支持的模型列表如下: - GPT ### 2.目录结构 整体的PaddleFleetX的目录结构如下: ```text . ├── benchmarks # benchmark评估结果和示例代码 │   └── README.md ├── Dockerfile ├── docs # 文档 │   ├── cluster_deployment.md │   ├── deployment_faq.md │   ├── docker_install.md │   ├── images │   ├── quick_start.md │   └── standard.md ├── ppfleetx │   ├── configs │   ├── core # 管理模型的组网规范,执行规范 │   ├── data # 数据集下载、预处理脚本 │   ├── models # 模型组网 │   ├── optims # 优化器类定义 │   └── utils ├── projects # 模型脚本,包含GPT模型 │   ├── ernie │   ├── gpt │   ├── imagen │   └── vit ├── README.md ├── requirements.txt ├── tasks │   └── gpt └── tools ├── auto.py ├── eval.py ├── export_model.py ├── inference.py └── train.py ``` ### 3.模型接入方法 根据模型训练的阶段不同,整体分为两个阶段:组网阶段和执行阶段。 #### 3.1 组网阶段 需要不同的分布式策略,它们会调用github/PaddlePaddle/Paddle核心框架里面的分布式高层API(FleetAPI),参考: 需要的并行方式。 - [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html) - [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html ) - [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html) - [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html) #### 3.2 执行阶段 ##### BasicModule 执行阶段采用Engine模块分装,为了能够保证Engine的模块化调用,需要将组网为``BasicModule``的子类,保证其规范化输出。其中``BasicModule``提供了多个统一的函数方法: | **函数名** | **参数释义** | |------------------------------|------------------------| | init | 接受用户的组网参数,实现Module初始化 | | pretreating_batch | 预处理batch数据 | | train_step | 一次完整的训练 | | train_step_end | 一次完整的训练后的操作 | | training_epoch_end | 一次完整的epoch训练后的操作 | | validation_step | 一次完整的验证 | | validation_step_end | 一次完整的验证后的操作 | | validation_epoch_end | 一次完整的epoch验证后的操作 | | test_step | 一次完整的测试 | | test_step_end | 一次完整的测试后的操作 | | configure_optimizers | 配置这次训练的优化器 | ##### EagerEngine ``EagerEngine``将上述函数串联起来,实现底层的执行逻辑对上层的屏蔽,减少冗余代码。 初始化需要传入对应的config配置,其层级配置如下: ```yaml Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | accumulate_steps | 梯度累加次数 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | enable | 是否使用混合精度策略进行训练 | | dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | | level | 混合精度训练模式,默认``O2``模式 | | scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔 | | save_epoch | 保存模型epoch间隔 | | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ``EagerEngine``中重载了多个常用函数,整体的说明如下: | **函数名** | **参数释义** | |------------------------------|------------------------| | fit | 模型训练 | | evaluate | 模型评估 | | predict | 模型预测 | | save | 模型参数保存 | | load | 模型参数加载 | 其中module和engine函数方法的映射关系如下: - fit ``fit``实现模型的训练,EagerEngine的内部调用伪代码如下: ```python module.model.train() for batch in train_dataloader: module.training_step() 
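# training_step 计算并返回当前 batch 的 loss；下面的 training_step_end 负责日志等后处理，随后由优化器完成参数更新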
module.training_step_end() module.optimizer.step() module.lr_scheduler.step() module.optimizer.clear_grad() ``` - evaluate ``evaluate``实现模型的评估,``EagerEngine``的内部调用伪代码如下: ```python with paddle.no_grad(): module.model.eval() for batch in vailidation_dataloader: module.validation_step() module.validation_step_end() ``` - test `` predict``实现模型的预测,``EagerEngine``的内部调用伪代码如下: ```python with paddle.no_grad(): module.model.eval() for batch in test_dataloader: module.predict_step() module.predict_step_end() ``` ### 4.模型接入示例 1、构建组网文件,放置在`ppfleex/models`目录下。 ```python class SimpleNet(nn.Layer): def __init__(self): super(SimpleNet, self).__init__() self.fc1 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc2 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc3 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc4 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc5 = nn.Linear(IMAGE_SIZE, CLASS_NUM) def forward(self, image, label=None): output = self.fc1(image) output = self.fc2(output) output = self.fc3(output) output = self.fc4(output) return self.fc5(output) class LossLayer(nn.Layer): def __init__(self): super(LossLayer, self).__init__() def forward(self, image, label=None): return F.cross_entropy(image, label) ``` 2、构建BasicModule,设置符合要求的组网形式,放置在`ppfleetx/models`目录下;并引入`ppfleetx/models/__init__.py` ```python class TestModule(BasicModule): def __init__(self): super().__init__() self.loss_fn = LossLayer() def get_model(self): model = SimpleNet() return model def forward(self, x): return self.model(x) def training_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def training_step_end(self, log_dict): logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'])) def validation_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def validation_step_end(self, log_dict): logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost'])) def test_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def test_step_end(self, log_dict): logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'])) ``` 3、通过config配置Dataset Dataset可以通过config文件进行配置。新增Dataset类型放置在 `ppfleetx/data/dataset`,同时其构造参数于其对应的Dataset字段一致。比如: ```python class GPTDataset(paddle.io.Dataset): def __init__(self, input_dir, split, max_seq_len, num_samples, mode, seed=1234): ``` 对应config中的yaml字段: ```yaml Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: DistributedBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ``` 4、通过config配置Optimizer和LR ```yaml Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False ``` 5、运行模型相关的配置文件以及相应的运行脚本,放置在[projects](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/projects)目录。 ### 5.模型推理示例 模型训练完成后,可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。 总共分为两个步骤:模型导出和推理部署。可以参考[GPT的模型推理](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/docs/inference.md)。 ================================================ FILE: examples/transformer/__init__.py 
================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: examples/transformer/models/GPT/docs/README.md ================================================ # GPT ## 模型介绍 GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)/[3](https://arxiv.org/pdf/2005.14165.pdf) 是以[Transformer](https://arxiv.org/abs/1706.03762) 解码器为网络基本组件,使用自回归的方式在大规模无标注文本语料上进行预训练得到的语言生成模型。 本项目是语言模型 GPT 的 PaddlePaddle 大模型实现。目前,PaddleFleetX 提供了 [GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型文件;分别基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl) 和 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) 数据集,采用 ACC(accuracy) 和 PPL(perplexity) 指标后的评估结果如下: | **模型文件** | **ACC** | **PPL** | |---------|-----------|---------------| | GPT-345M | 44.17% | 18.01 | 下面是本例的简要目录结构及说明: ```text . ├── docs # 一站式文档入口 ├── finetune # GLUE 下游任务微调入口 ├── generation # 文本生成体验入口 ├── offline-eval # 模型精度离线评估入口 ├── pretrain # 预训练入口 ``` ## 快速开始 ### 环境依赖 请确保已根据根目录 requirements.txt 安装所需依赖,或者通过以下命令快速安装 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ### 数据准备 数据获取和制作详见[GPT 模型预训练数据准备流程](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/gpt) 为了方便用户运行测试本模型,此处提供处理好的300M的训练样本,在单卡训练或混合并行训练前都需要通过以下命令获取数据。 **数据下载命令** ```shell cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 # 下载样例数据 mkdir data && cd data wget -O gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz cd .. 
# 回到 GPT 目录下 ``` ### 模型训练 除了单卡训练,飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略,减少显存占用、加速训练,达到大模型可训练且训得快的效果。在模型训练前,需要根据模型规模选择合适的并行策略。下面分别从单卡训练和混合并行训练两个方面来介绍GPT模型训练的配置文件和启动方式。 - [单卡训练](./single_card.md) - [混合并行训练](./hybrid_parallel.md) ### 文本生成体验 - [单卡预训练模型文本生成](./single_card.md#GPT-Zero-shot-文本生成) - [混合并行预训练模型文本生成](./hybrid_parallel.md#GPT-Zero-shot-文本生成) ### 模型压缩 - [量化训练](./quantization_aware_training.md) ### 推理部署 - [推理部署](inference.md) ### GLUE 下游任务微调 - [单卡微调](./single_finetune.md) ## 参数释义 ### 全局信息 全局参数指定训练的batch size,以及设备、随机种子等信息;除此之外,模型训练/验证/推理等过程中的必要参数设置也在这里完成。 ```yaml Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | device | 设备信息 | | seed | 随机数种子 | | global_batch_size | 全局的batch size大小,即一次参数更新等效的batch size | | local_batch_size | 每个进程训练的batch size大小 | | micro_batch_size | 每次前向计算的batch size大小 | | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | accumulate_steps | 梯度累加次数 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | | enable | 是否使用混合精度策略进行训练 | | dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | | level | 混合精度训练模式,默认``O2``模式 | | scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔step数 | | save_epoch | 保存模型间隔epoch数 | | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ### 模型网络 网络部分完成了网络的组网操作,GPT在[single_model.py](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/dygraph/single_model.py)下。 可以使用配置文件配置模型的规模,如: ```yaml Model: name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True fuse_attn_qkv: True sequence_parallel: False ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | vocab_size | 训练词表大小 | | hidden_size | 隐藏层大小 | | num_layers | transformer层数 | | num_attention_heads | attention head的数量 | | max_seq_len | 输入文本序列的长度 | | ffn_hidden_size | ffn层大小,一般为隐藏层的四倍 | | attention_probs_dropout_prob | attention中的dropout的失活率 | | max_position_embeddings | position embedding的长度 | | type_vocab_size | 词表类型 | | initializer_range | 参数初始化的范围 | | use_recompute | 是否使用recompute训练 | | recompute_granularity | recompute训练的粒度,可选 `full` `full_attn` `core_attn`,full即recompute全部transformer,full_attn表明只recompute所有self attention部分,core_attn表明只recompute `softmax(qkT)v` 部分。注:显存占用方面,`core_attn` > `full_attn` > `full`,若所选策略产生OOM错误,可以适当更改recompute_granularity | |no_recompute_layers| list of integer,标识哪些层的transformer不需要进行recompute。所有在该list中的值应该 >= 0 同时应该 < num_layers。向该参数中增加不进行recompute 
的层数可以提升模型训练的整体吞吐,但是会适当的增加显存。若训练中发现有显存富裕,可以适当增加不进行recompute的层数。如果使用该参数后出现OOM错误,可以适当减小不进行recompute的层数。 | | fused_linear | 是否使用fused_linear代替传统Linear加速训练。注:该功能需要cuda 11.6及以上编译的paddle支持。 | | fuse_attn_qkv | 是否对attention层中的qkv计算使用fuse策略以加速训练 | | sequence_parallel | 是否使用序列并行策略以加速训练。注:只有混合并行的GPT才支持该功能,它与张量模型并行共用通信组,当mp_degree=1时,序列并行策略会被强制关闭。 | | virtual_pp_degree | 虚拟流水线并行维度,该参数会减小流水线bubble的占比以提升流水线的吞吐。但是该参数会增加流水线间的通讯,所以该参数的推荐值为2。并且,只有 num_layers可以被 pp_degree * virtual_pp_degree 整除时,才可以使用虚拟流水线并行。 | ### 数据集 数据集参数分为“Train”、“Eval”和“Test”三部分,分别对应模型预训练、离线评估、推理等三个模块。 每个模型的配置参数都包含以下内容: ```yaml Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: DistributedBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | dataset.name | 指定自定义数据集的名称 | | input_dir | 指定输入文件,可以使用目录,指定目录时将包括目录中的所有文件 | | split | 训练集,验证集和测试集的切分比例 | | max_seq_len | 输入文本序列的长度 | | sampler.name | 指定自定义采样器的名称 | | shuffle | 是否需要在生成样本下标时打乱顺序 | | drop_last | 是否需要丢弃最后无法凑整一个mini-batch的样本 | | num_workers | 用于加载数据的子进程个数 | | return_list | 每个设备上的数据是否以list形式返回 | | collate_fn | 通过此参数指定如果将样本列表组合为mini-batch数据;支持自定义 | ### 优化器 GPT训练默认使用AdamW优化器以及cosine学习率衰减,这里通过配置文件配置优化器的参数,如: ```yaml Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | name | 指定自定义优化器的名称 | | weight_decay | weight的衰减率 | | beta1 | 一阶矩估计的指数衰减率 | | beta2 | 二阶矩估计的指数衰减率 | | epsilon | 指定优化器需要优化的参数 | | lr.name | 指定自定义学习率策略的名称 | | decay_steps | 衰减的步长 | | warmup_rate | warmup 率 | | max_lr | Adam 的初始最大学习率 | | min_lr | Adam 的初始最小学习率 | | grad_clip.name | 指定自定义梯度裁剪策略的名称 | | clip_norm | 所允许的范数最大值 | | tensor_fusion | 是否使用tensor_fustion功能加速训练 | 另外,[Profiler](./hybrid_profiler.md)中还介绍了在 GPT 中开启 Profiler 并分析调试分析结果的方法及相关的参数解释。 ### 模型压缩 PaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法:量化训练(Qutization Aware Training,QAT)、结构化稀疏(Structured Pruning,SP)和知识蒸馏(Knowledge Distillation,KD)。详细参数介绍见[模型压缩介绍](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/docs/compression.md)。 ## 参考文献 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) - [Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165.pdf) - [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) ================================================ FILE: examples/transformer/models/GPT/docs/hybrid_parallel.md ================================================ # GPT 混合并行模型训练 当训练超大模型时,就必须借助混合并行策略,混合并行策略分别指数据并行、张量模型并行、流水线并行和分组切片并行。其中数据并行保存完整的模型参数并独立处理一份子数据集,以加速模型训练过程;张量模型并行将网络中的张量(Tensor)切分到不同的设备,从而降低单个设备的显存消耗;流水线并行将模型的不同层放置到不同的计算设备,降低单个计算设备的显存消耗;分组切片并行将参数和模型状态划分到不同卡上,每个GPU只保存部分副本,以减少显存占用。联合四种训练方式,可以实现更大模型、更快训练的效果。具体策略以及相关FleetAPI介绍可以参考以下教程: - [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html) - [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html ) - 
[流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html) - [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html) ## 参数释义 ### 并行维度 当前GPT模型已适配3D混合并行,并能够在训练超大模型,用户可以通过配置文件选择并行的维度。 ```yaml Distributed: dp_degree: 2 mp_degree: 2 pp_degree: 2 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ``` 其中参数说明: | **参数名** | **参数释义** | |------------------|--------------------------------------| | dp_degree | 数据并行维度 | | mp_degree | 张量模型并行维度 | | pp_degree | 流水线并行维度 | | sharding_degree | 分组切分并行维度 | | sharding_stage | 切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 | | sharding_offload | CPU offload策略 | |reduce_overlap| 是否在sharding stage 2的模式下进行reduce通讯与反向计算的overlap,该策略暂时不支持sharding_offload| |broadcast_overlap| 是否在sharding stage 2的模式下进行broadcast通讯与下一个batch的 前向计算的overlap,该策略暂时不支持sharding_offload。若使用该模型,在evaluation与save之前,必须调用 `paddle.device.cuda.synchronize()` 方法| ## 运行方式 本目录中按照345M、1.3B、6.7B和175B规模大小,给出32G V100环境下GPT模型混合并行训练的策略配置如下: | 模型规模 | 训练策略 | yaml文件 | |----------|---------------------------|------------------------------| | 345M | fp16+mp8+qat | qat_gpt_345M_mp8.yaml | | 1.3B | fp16+dp8+recompute | pretrain_gpt_1.3B_dp8.yaml | | 6.7B | fp16+sharding16+recompute | pretrain_gpt_6.7B_sharding16.yaml | | 175B | fp16+mp8+pp16+recompute | pretrain_gpt_175B_mp8_pp16.yaml | 若要在显存容量更小的16G V100环境下进行GPT大模型训练,可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。 ### 策略支持 飞桨的混合并行技术包括4个维度:数据并行、张量模型并行、流水线并行和分组切片并行,此外还支持重计算、offload、混合精度、序列并行等策略,来减少显存占用、加速训练。 目前,GPT模型训练已支持前3个维度的任意策略组合,但分组切片并行stage2/3仅支持与数据并行策略组合使用;详见下表。 | | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute | |-----------------|---------------|-----------------|-------------------|-----------|-----------| | sharding stage1 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage2 | ✓ | ㄨ | ㄨ | ✓ | ✓ | | sharding stage3 | ✓ | ㄨ | ㄨ | ✓ | ✓ | ### 单机训练 以单机1.3B模型数据并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行。 **启动命令** ```shell cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ pretrain/run.py \ -c pretrain/configs/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: **启动命令** ```shell log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ pretrain/run.py \ -c pretrain/configs/pretrain_gpt_1.3B_dp8.yaml \ -o Model.hidden_size=1024 ``` 每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到;若未指定,日志路径为`log/workerlog.x`。运行日志具体内容如下: **运行日志** ``` [2022-09-21 05:43:58,797] [ INFO] - [train] epoch: 0, batch: 0, loss: 10.992407799, avg_batch_cost: 5.51734 sec, speed: 0.18 step/s, ips_total: 11878 tokens/s, ips: 1485 tokens/s, learning rate: 2.77778e-08 [2022-09-21 05:43:59,508] [ INFO] - [train] epoch: 0, batch: 1, loss: 11.000075340, avg_batch_cost: 0.71029 sec, speed: 1.41 step/s, ips_total: 92267 tokens/s, ips: 11533 tokens/s, learning rate: 4.16667e-08 [2022-09-21 05:44:00,242] [ INFO] - [train] epoch: 0, batch: 2, loss: 11.017463684, avg_batch_cost: 0.73301 sec, speed: 1.36 step/s, ips_total: 89406 tokens/s, ips: 11176 tokens/s, learning rate: 5.55556e-08 [2022-09-21 05:44:00,965] [ INFO] - [train] epoch: 0, batch: 3, loss: 10.983654976, avg_batch_cost: 0.72319 sec, speed: 1.38 step/s, ips_total: 90620 tokens/s, ips: 11328 
tokens/s, learning rate: 6.94444e-08 [2022-09-21 05:44:01,678] [ INFO] - [train] epoch: 0, batch: 4, loss: 11.014451981, avg_batch_cost: 0.71223 sec, speed: 1.40 step/s, ips_total: 92016 tokens/s, ips: 11502 tokens/s, learning rate: 8.33333e-08 [2022-09-21 05:44:02,385] [ INFO] - [train] epoch: 0, batch: 5, loss: 11.005180359, avg_batch_cost: 0.70707 sec, speed: 1.41 step/s, ips_total: 92687 tokens/s, ips: 11586 tokens/s, learning rate: 9.72222e-08 [2022-09-21 05:44:03,100] [ INFO] - [train] epoch: 0, batch: 6, loss: 10.989698410, avg_batch_cost: 0.71402 sec, speed: 1.40 step/s, ips_total: 91785 tokens/s, ips: 11473 tokens/s, learning rate: 1.11111e-07 [2022-09-21 05:44:03,806] [ INFO] - [train] epoch: 0, batch: 7, loss: 10.992337227, avg_batch_cost: 0.70554 sec, speed: 1.42 step/s, ips_total: 92888 tokens/s, ips: 11611 tokens/s, learning rate: 1.25000e-07 [2022-09-21 05:44:04,516] [ INFO] - [train] epoch: 0, batch: 8, loss: 10.972790718, avg_batch_cost: 0.71011 sec, speed: 1.41 step/s, ips_total: 92290 tokens/s, ips: 11536 tokens/s, learning rate: 1.38889e-07 [2022-09-21 05:44:05,228] [ INFO] - [train] epoch: 0, batch: 9, loss: 10.983499527, avg_batch_cost: 0.71128 sec, speed: 1.41 step/s, ips_total: 92138 tokens/s, ips: 11517 tokens/s, learning rate: 1.52778e-07 ``` ### 多机训练 若需要在更多机器上进行大模型训练,则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令(master节点ip为训练所用某一台机器的ip即可)。 以2机16卡32G V100上的6.7B模型分组切分并行训练为例,启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ pretrain/run.py -c pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型两机训练,也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" pretrain/run.py \ -c pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml \ -o Model.hidden_size=2048 ``` 若要执行16机175B大模型混合并行训练,以运行启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_mp8_pp16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=16 --devices "0,1,2,3,4,5,6,7" pretrain/run.py \ -c pretrain/configs/pretrain_gpt_175B_mp8_pp16.yaml ``` 当节点较多时,可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。 ### 量化训练 若需要对模型进行量化训练,按照以上在配置文件中添加量化参数,可参考`qat_gpt_345M_mp8.yaml`,量化训练时可以可以适当减少训练轮数和学习率。以单机345M模型模型并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行,命令如下: ```shell log_dir=log_mp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" pretrain/run.py \ -c pretrain/configs/qat_gpt_345M_mp8.yaml \ -o Global.max_steps=100000 \ -o Optimizer.lr.decay_steps=72000 \ -o Optimizer.lr.max_lr=5.0e-6 \ -o Optimizer.lr.min_lr=1.0e-6 ``` # GPT Zero-shot 文本生成 ## 参数释义 ```yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | top_k | 每次为采样挑选保留分数最高的 k 个 token | | top_p | 如果设置小于 1.0 的小数,则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0 | | temperature | 调节下一个 token 的概率温度,logits = logits / temperature,默认值为 1.0 | | min_dec_len | 最小生成 token 长度 | | max_dec_len | 最大生成 token 长度 | | num_return_sequences | 每个输入生成的序列个数,默认值为 1 | | decode_strategy | 解码策略,默认值为 "sampling",目前只支持 "sampling",未来会支持 "greedy_search","beam_search" | 
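上表中的 top_k、top_p、temperature 共同决定 `sampling` 解码每一步如何从词表分布中选取下一个 token。下面给出一个与框架无关的最小示意（纯 numpy 实现，`sample_next_token` 为示例命名，并非 PaddleFleetX 的真实接口），用于说明这几个参数的过滤顺序与含义：

```python
import numpy as np

def sample_next_token(logits, top_k=50, top_p=0.75, temperature=1.0, rng=None):
    """对单步 logits 依次应用 temperature、top_k、top_p 过滤后随机采样，返回被选中的 token id。"""
    rng = rng or np.random.default_rng()
    logits = np.asarray(logits, dtype="float64") / temperature
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()

    # top_k：只保留概率最高的 k 个 token
    top_k = min(top_k, probs.size)
    if top_k > 0:
        kth = np.sort(probs)[-top_k]
        probs[probs < kth] = 0.0

    # top_p（nucleus）：按概率从高到低累加，保留累计概率首次达到 top_p 的最小 token 集合
    order = np.argsort(-probs)
    cumulative = np.cumsum(probs[order] / probs.sum())
    to_drop = order[cumulative > top_p][1:]  # 保留越过 top_p 的那个 token，屏蔽其余
    probs[to_drop] = 0.0

    probs /= probs.sum()
    return int(rng.choice(probs.size, p=probs))

# 示例：从一个长度为 8 的“词表”分布中采样下一个 token
fake_logits = np.array([2.0, 1.5, 1.0, 0.5, 0.0, -0.5, -1.0, -2.0])
print(sample_next_token(fake_logits, top_k=4, top_p=0.9))
```

实际生成时，模型每一步都会用过滤后的分布采样一个 token，直到生成 `eos_token_id` 或达到 `max_dec_len`。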
## 文本生成 下载预训练好的模型,快速体验文本生成 ```shell cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 mkdir -p ckpt wget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz tar -xzf ckpt/GPT_345M.tar.gz -C ckpt/ # --devices 根据并行策略设置设备 python -m paddle.distributed.launch --devices "0" generation/run.py \ -c generation/configs/generation_gpt_345M_dp8.yaml \ -o Global.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/ # 生成的文本,由于 checkpoint 不同,超参不同,随机数不同,您执行可能会生成不一样的内容 Prompt: Hi, GPT2. Tell me who Jack Ma is. Generation: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.” For now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba. Jack Ma on why he never wanted to run for President in 2016: There were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family. On how Alibaba will evolve into a new player in China’s transportation and logistics sector: I think that we are going to become a very important player in the logistics industry. So our strategy is to make it easy for people to travel. ``` ### 剖析体验文本生成 #### GPT 文本生成模块初始化 ```python module = build_module(cfg) module.model.eval() ``` #### 预训练模型加载 ```python # 获取到预训练 checkpoint 的根目录 ckpt_dir = cfg.Global.save_load.ckpt_dir # 构造出具体路径 model_path = os.path.join(ckpt_dir, "model.pdparams") # 加载模型参数 model_dict = paddle.load(model_path) # FP16 模型参数转成 FP32 模型参数 for key, value in model_dict.items(): model_dict[key] = model_dict[key].astype(paddle.float32) # 设置模型参数为预训练参数 module.model.set_state_dict(model_dict) ``` #### 文本生成与结果展示 ```python input_text = "Historical Records: Tell us about the history of the Great Wall." 
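# generate() 会按照 Generation 配置中的 top_k、top_p、temperature、max_dec_len 等参数进行采样解码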
result = module.generate(input_text) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') ``` ================================================ FILE: examples/transformer/models/GPT/docs/hybrid_profiler.md ================================================ # Profiler 本文档主要包括在 GPT 中开启 Profiler 并分析调试分析结果的方法,在模型开发中使用 Profiler 分析工具的方法请参考[教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)和[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/profiler/Profiler_cn.html)。 ## 参数配置 使用 Profiler 功能需要在任务配置文件中添加 Profiler 配置信息并确保字段为 `enable: True` 以开启分析器。 完整的可配置参数如下所示,可以根据使用场景调整配置。 ``` Profiler: enable: True scheduler: [1, 5] profiler_log: log_path detailed: True record_shapes: True profile_memory: True summary: overview: True device: True model: True dist: True kernel: True op: True mem: True memcpy: True ``` 其中参数说明: | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | enable | 是否开启 Profiler | False | | scheduler | 定义分析区间,如 [1, 5] 记录 step 1 到 step 4 的分析数据 | None | | profiler_log | 日志文件目录 | profiler_log | | detailed | 是否显示详细信息 | False | | record_shapes | 是否记录 tensor shape 相关信息 | True | | profile_memory | 是否统计 memory 相关信息 | True | 其中,当 detailed=True 时会打印所有 summary 表格数据,当 detailed=False 时用户可以根据以下说明定制需要展示的表格信息。 | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | summary.overview | 显示每种类型的 Event 时间消耗 | True | | summary.device | 显示 CPU 和 GPU 的平均利用率信息 | False | | summary.model | 显示模型 dataloader、forward、backward、optimization 时间消耗 | True | | summary.dist | 显示计算、通信以及重叠时间 | False | | summary.kernel | 显示 GPU 执行的 kernel 信息 | True | | summary.op | 显示框架中算子 (op) 的执行信息 | True | | summary.mem | 显示内存/显存占用统计信息 | False | | summary.memcpy | 显示框架中调用内存操作所花费的时间 | False | ## 运行分析 本节以 gpt混合并行 为例,首先进入目录, ``` cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 ``` 修改`pretrain/configs/pretrain_gpt_base.yaml` 中 Profiler.enable 为 True, 同时可以根据上节说明调整相关配置,或者使用命令行参数覆盖,例如可以使用以下命令运行程序, ``` python -m paddle.distributed.launch \ ./pretrain/run.py -c \ ./pretrain/configs/pretrain_gpt_1.3B_dp8.yaml -o Profiler.enable=True ``` > 在使用 Profiler 工具进行性能分析时,建议减少 train 的步数,获得分析数据即可停止训练。 ## 结果分析 在训练结束后会有以下数据: * 根据配置信息在控制台打印 summary 表格 * 在配置的 `profiler_log` 目录保存 profiler json 文件 这里保存的 json 文件可以通过如下两种方式查看: * 在 chrome 浏览器中打开 chrome://tracing/,然后打开 json 文件查看 * 根据控制台信息安装并启动 `visualdl --logdir log_path` 然后根据提示在浏览器中**性能分析**模块查看 具体的信息含义解释以及分析方法请参考[文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)。 > 在使用 visualdl 时,如果 log 文件数据较大,启动会比较耗时,请耐心等待。 ## 附录 控制台打印的 summary 信息示例如下所示。 **Overview Summary** ``` ---------------------------------------------Overview Summary--------------------------------------------- Time unit: ms ------------------------- ------------------------- ------------------------- ------------------------- Event Type Calls CPU Time Ratio (%) ------------------------- ------------------------- ------------------------- ------------------------- ProfileStep 4 18591.04 100.00 CudaRuntime 87527 8555.11 46.02 Operator 21912 1883.11 10.13 UserDefined 13116 1841.33 9.90 OperatorInner 33668 1018.39 5.48 Forward 8 731.46 3.93 Backward 4 671.82 3.61 Optimization 4 315.91 1.70 Dataloader 4 1.37 0.01 ------------------------- ------------------------- ------------------------- ------------------------- Calls GPU Time Ratio (%) ------------------------- 
------------------------- ------------------------- ------------------------- Kernel 16092 4924.90 26.49 Memcpy 4278 3617.26 19.46 Memset 780 2.31 0.01 Communication 192 2363.13 12.71 ------------------------- ------------------------- ------------------------- ------------------------- ``` **Model Summary** ``` -----------------------------------------------------Model Summary----------------------------------------------------- Time unit: ms --------------- ------ ----------------------------------------------- --------------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) --------------- ------ ----------------------------------------------- --------------------------------------------- ProfileStep 4 18591.04 / 4647.76 / 14114.47 / 757.27 / 100.00 4924.90 / 1231.22 / 2853.61 / 682.04 / 100.00 Dataloader 4 1.37 / 0.34 / 0.85 / 0.16 / 0.01 0.00 / 0.00 / 0.00 / 0.00 / 0.00 Forward 8 731.46 / 91.43 / 133.28 / 49.03 / 3.93 714.83 / 89.35 / 174.91 / 4.72 / 14.51 Backward 4 671.82 / 167.96 / 168.29 / 167.52 / 3.61 1701.53 / 425.38 / 426.97 / 424.10 / 34.55 Optimization 4 315.91 / 78.98 / 89.07 / 73.78 / 1.70 108.27 / 27.07 / 27.09 / 27.06 / 2.20 Others - 16870.48 / - / - / - / 90.75 2400.27 / - / - / - / 48.74 --------------- ------ ----------------------------------------------- --------------------------------------------- ``` **Operator Summary** ``` ----------------------------------------------------------------Operator Summary----------------------------------------------------------------- Time unit: ms ---------------------------------------------------- ------ ----------------------------------------- ---------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) ---------------------------------------------------- ------ ----------------------------------------- ---------------------------------------- -----------------------------------------------------------Thread: All threads merged------------------------------------------------------------ GradNodePyLayer_RecomputeFunction_backward 96 663.37 / 6.91 / 17.17 / 4.01 / 18.56 1629.87 / 16.98 / 17.41 / 16.69 / 26.98 TransformerDecoderLayer 96 262.68 / 2.74 / 5.91 / 1.90 / 39.60 661.18 / 6.89 / 7.11 / 6.73 / 40.57 backward 96 318.62 / 3.32 / 10.57 / 1.31 / 48.03 968.69 / 10.09 / 10.31 / 9.91 / 59.43 matmul dygraph 2312 200.13 / 0.09 / 1.61 / 0.04 / 5.60 1487.76 / 0.64 / 9.81 / 0.22 / 24.63 matmul infer_meta 964 1.42 / 0.00 / 0.01 / 0.00 / 0.71 0.00 / 0.00 / 0.00 / 0.00 / 0.00 matmul compute 964 71.38 / 0.07 / 1.59 / 0.03 / 35.67 644.02 / 0.67 / 9.81 / 0.22 / 43.29 MEMSET 192 - / - / - / - / - 0.42 / 0.00 / 0.00 / 0.00 / 0.07 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 - / - / - / - / - 199.35 / 0.52 / 0.83 / 0.22 / 30.95 volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn 384 - / - / - / - / - 263.96 / 0.69 / 0.79 / 0.59 / 40.99 volta_h884gemm_64x128_ldg8_nn 192 - / - / - / - / - 141.13 / 0.74 / 0.92 / 0.61 / 21.91 void cutlass::Kernel 580 209.08 / 0.36 / 0.97 / 0.06 / 4.25 volta_h884gemm_64x128_ldg8_nn 288 203.89 / 0.71 / 0.92 / 0.57 / 4.14 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 199.35 / 0.52 / 0.83 / 0.22 / 4.05 volta_h884gemm_256x64_ldg8_tn 288 149.52 / 0.52 / 0.54 / 0.45 / 3.04 void phi::funcs::VectorizedBroadcastKernel 192 122.37 / 0.64 / 0.66 / 0.60 / 2.48 void cutlass::Kernel 100 103.07 / 1.03 / 8.08 / 0.73 / 2.09 void phi::funcs::VectorizedElementwiseKernel 1: raise 
RuntimeError("Only support single-card finetune for GPT model.") env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.print_config(config) # build dataloader for training/eval dataset = cpn.build_dataset(config.Data.Train.dataset) sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset) train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset, sampler) dataset = cpn.build_dataset(config.Data.Eval.dataset) sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset) valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset, sampler) # build GPT model model, tokenizer, train_loss_fn, eval_loss_fn = impls.build_model(config) if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. model = paddle.amp.decorate(models=model, level='O2') else: scaler = None # build metric model_setting = copy.deepcopy(config.Model) metric_config = model_setting.pop("metric", None) assert metric_config is not None and 'eval' in metric_config if 'train' in metric_config: train_metric = copy.deepcopy(metric_config.train) train_metric_cls = train_metric.pop('name') train_metric = eval("metrics.{}".format(train_metric_cls))( **train_metric) eval_metric = copy.deepcopy(metric_config.eval) eval_metric_cls = eval_metric.pop('name') eval_metric = eval("metrics.{}".format(eval_metric_cls))(**eval_metric) best_metric = 0.0 # build lr and optim config.Optimizer.lr.update({ 'epochs': config.Global.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': config.Global.max_steps, }) if 'multi_precision' in config.Optimizer: assert config.Optimizer.pop('multi_precision') \ == config.Global.mix_precision.enable lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: model, optimizer, scaler = strategy.wrap_with_fleet( config.Distributed, model, optimizer, scaler) # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train', load_recovery) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start training assert config.Global.get('run_mode', 'epoch') == 'epoch', 'run_mode must be epoch' train_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(load_recovery['epoch'], config.Global.num_train_epochs): train_epoch_start = log.get_timestamp() # time count train_losses = [] train_step_start = log.get_timestamp() # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. 
total_train_batch = len(train_data_loader) total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 for step, batch in enumerate(train_data_loader): if epoch_index == load_recovery['epoch']: if step <= load_recovery['step']: continue model.train() fit_kwargs = { "model": model, "scaler": scaler, "optimizer": optimizer, "loss_fn": train_loss_fn, } def forward_func(batch, model, loss_fn): input_ids, labels = batch input_ids.stop_gradient = True labels.stop_gradient = True logits = model(input_ids) loss = loss_fn(logits, labels) return loss loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs) train_losses.append(loss) # training step log if (step + 1) % config.Global.logging_freq == 0: train_step_cost = log.get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] train_cost = train_step_cost \ if step == 0 else train_step_cost / config.Global.logging_freq speed = 1. / train_cost default_global_tokens_num = config.Global.global_batch_size * \ config.Data.Train.dataset.max_length ips_total = speed * default_global_tokens_num ips = ips_total / env.get_data_world_size() logger.info( "[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: " \ "%.5f sec, speed: %.2f step/s, ips_total: %.0f tokens/s, ips: %.0f tokens/s" % (epoch_index, config.Global.num_train_epochs, step, total_train_batch, optimizer.get_lr(), sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips)) train_step_start = log.get_timestamp() train_losses = [] if lr_scheduler is not None: lr_scheduler.step() optimizer.clear_grad() # save model/optim states in 'step' mode if step > 0 and config.Global.save_load.save_steps > 0 and \ step % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=step, epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) if profiler: profiler.step() # training epoch log train_epoch_cost = log.get_timestamp() - train_epoch_start logger.info("[Training] epoch: %d, total time: %.5f sec" % (epoch_index, train_epoch_cost)) eval_epoch_start = log.get_timestamp() # start eval in 'epoch' mode eval_step_start = log.get_timestamp() eval_losses = [] total_eval_batch = len(valid_data_loader) for eval_step, batch in enumerate(valid_data_loader): loss = impls.eval_impl(config, batch, model, eval_loss_fn, eval_metric) eval_losses.append(float(loss)) if eval_step % config.Global.logging_freq == 0: eval_step_cost = log.get_timestamp() - eval_step_start speed = 1. 
/ eval_step_cost logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (epoch_index, eval_step, sum(eval_losses) / len(eval_losses), eval_step_cost, speed)) eval_step_start = log.get_timestamp() eval_losses = [] eval_epoch_cost = log.get_timestamp() - eval_epoch_start # eval epoch log res = eval_metric.accumulate() eval_metric.reset() if isinstance(eval_metric, metrics.AccuracyAndF1): msg = "acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f" % ( res[0], res[1], res[2], res[3], res[4]) metric = res[4] elif isinstance(eval_metric, metrics.Mcc): msg = "mcc: %.5f" % (res[0]) metric = res[0] elif isinstance(eval_metric, metrics.PearsonAndSpearman): msg = "pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f" % ( res[0], res[1], res[2]) metric = res[2] else: msg = "acc: %.5f" % (res) metric = res if metric > best_metric: best_metric = metric logger.info( "[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f" % (epoch_index, eval_epoch_cost, msg, best_metric)) # save model/optim states in 'epoch' mode if config.Global.save_load.save_epoch > 0 and \ epoch_index % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=len(train_data_loader), epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) # training end log logger.info( "The training process is complete and total cost of time for training is : {}". format( log.convert_timestamp_to_data(log.get_timestamp() - train_start))) if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/models/GPT/finetune/run_task.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
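# 用法示例：sh run_task.sh <task>，其中 <task> 为 CoLA、SST2、MRPC、QQP、STSB、MNLI、QNLI、RTE、WNLI 之一；MNLI 可通过第二个参数指定评估集（默认 dev_matched）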
export CUDA_VISIBLE_DEVICES=0 # Single-Sentence Tasks if [ $1 == "CoLA" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=CoLA \ -o Data.Train.dataset.root=./dataset/cola_public/ \ -o Data.Eval.dataset.name=CoLA \ -o Data.Eval.dataset.root=./dataset/cola_public/ \ -o Data.Eval.dataset.split=dev \ -o Model.metric.train.name=Mcc \ -o Model.metric.eval.name=Mcc \ -o Model.num_classes=2 elif [ $1 == "SST2" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=SST2 \ -o Data.Train.dataset.root=./dataset/SST-2/ \ -o Data.Eval.dataset.name=SST2 \ -o Data.Eval.dataset.root=./dataset/SST-2/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 # Similarity and Paraphrase Tasks elif [ $1 == "MRPC" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Global.num_train_epochs=5 \ -o Data.Train.dataset.name=MRPC \ -o Data.Train.dataset.root=./dataset/MRPC/ \ -o Data.Eval.dataset.name=MRPC \ -o Data.Eval.dataset.root=./dataset/MRPC/ \ -o Data.Eval.dataset.split=test \ -o Model.num_classes=2 \ -o Model.metric.train.name=AccuracyAndF1 \ -o Model.metric.eval.name=AccuracyAndF1 elif [ $1 == "QQP" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=QQP \ -o Data.Train.dataset.root=./dataset/QQP/ \ -o Data.Eval.dataset.name=QQP \ -o Data.Eval.dataset.root=./dataset/QQP/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 \ -o Model.metric.train.name=AccuracyAndF1 \ -o Model.metric.eval.name=AccuracyAndF1 elif [ $1 == "STSB" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=STSB \ -o Data.Train.dataset.root=./dataset/STS-B/ \ -o Data.Eval.dataset.name=STSB \ -o Data.Eval.dataset.root=./dataset/STS-B/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=1 \ -o Model.metric.train.name=PearsonAndSpearman \ -o Model.metric.eval.name=PearsonAndSpearman \ -o Model.loss.train.name=MSELoss \ -o Model.loss.eval.name=MSELoss # Inference Tasks elif [ $1 == "MNLI" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=MNLI \ -o Data.Train.dataset.root=./dataset/multinli_1.0 \ -o Data.Eval.dataset.name=MNLI \ -o Data.Eval.dataset.root=./dataset/multinli_1.0 \ -o Data.Eval.dataset.split=${2:-"dev_matched"} \ -o Model.num_classes=3 elif [ $1 == "QNLI" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=QNLI \ -o Data.Train.dataset.root=./dataset/QNLI/ \ -o Data.Eval.dataset.name=QNLI \ -o Data.Eval.dataset.root=./dataset/QNLI/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 elif [ $1 == "RTE" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=RTE \ -o Data.Train.dataset.root=./dataset/RTE/ \ -o Data.Eval.dataset.name=RTE \ -o Data.Eval.dataset.root=./dataset/RTE/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 elif [ $1 == "WNLI" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Global.num_train_epochs=5 \ -o Data.Train.dataset.name=WNLI \ -o Data.Train.dataset.root=./dataset/WNLI/ \ -o Data.Eval.dataset.name=WNLI \ -o Data.Eval.dataset.root=./dataset/WNLI/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 else echo "Task 
name not recognized, please input CoLA, SST2, MRPC, QQP, STSB, MNLI, QNLI, RTE, WNLI." fi ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_gpt_345M_dp8.yaml ================================================ _base_: ./generation_gpt_base.yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_base.yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 8 micro_batch_size: 8 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: fused_linear: False fuse_attn_qkv: True sequence_parallel: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_pruned_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_base.yaml Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_qat_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_base.yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_qat_gpt_6.7B_single_card.yaml ================================================ 
_base_: ./generation_gpt_base.yaml Model: vocab_size: 50304 hidden_size: 1024 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: 16384 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/generation/configs/inference_gpt_345M_dp8.yaml ================================================ _base_: ./generation_gpt_345M_dp8.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/configs/inference_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_345M_single_card.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
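# 将训练好的 GPT 模型导出为静态图推理模型（配置 Compress.Quantization 时导出量化模型），导出结果保存在 Global.save_load.output_dir 下的 rank_{dp_rank} 目录中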
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.utils.export import export_inference_model from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) if config.Global.mix_precision.enable: logger.info("NOTE: disable mix_precision in export mode") # build GPT model model, _ = impls.build_model(config) # export model.eval() input_spec = [ InputSpec( shape=[None, None], name="input_ids", dtype='int64') ] output_dir = config.Global.save_load.output_dir dp_rank = 0 if nranks == 1 else env.get_hcg().get_data_parallel_rank() save_dir = os.path.join(output_dir, "rank_{}".format(dp_rank)) quanter = None quant_mode = False if 'Compress' in config: mode = 'compress' compress_configs = config['Compress'] if "Quantization" in compress_configs: quant_mode = True model, quanter = qat.compress_model(config, model, input_spec) # load pretrained checkpoints if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='export', load_recovery=None) if not quant_mode: export_inference_model(model, input_spec, save_dir, 'model') else: logger.info("export quantized model.") export_inference_model( model, input_spec, save_dir, 'model', export_quant_model=True, quanter=quanter) ================================================ FILE: examples/transformer/models/GPT/generation/impls.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
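# GPT 文本生成示例的辅助实现：构建生成模型与 tokenizer（仅支持单卡或数据并行），并提供输入序列的 left padding 工具函数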
import os import sys import copy import numpy as np import paddle import paddle.distributed as dist from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env import ppfleetx.models.language_model.gpt as gpt from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer from examples.transformer.models.GPT.pretrain.impls import fit_impl as pretrain_fit_impl MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def adjust_length_to_model(length, max_sequence_length): if length < 0 or length > max_sequence_length: length = max_sequence_length return length def build_model(config): nranks = dist.get_world_size() generation_cfgs = config.Generation model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) if nranks == 1: model = gpt.GPTForGeneration( gpt.GPTModel(**model_setting), generation_cfgs) else: assert nranks == config.Distributed.dp_degree, \ "only support single card and data parallel in generation task." model = gpt.GPTForGenerationHybrid( gpt.GPTModelHybrid(**model_setting), generation_cfgs) generation_cfgs['max_dec_len'] = adjust_length_to_model( generation_cfgs['max_dec_len'], 512) generation_cfgs['bos_token_id'] = tokenizer.eos_token_id generation_cfgs['eos_token_id'] = tokenizer.eos_token_id generation_cfgs['pad_token_id'] = tokenizer.eos_token_id return model, tokenizer def left_padding(inputs, pad_id, padding="longest"): assert "input_ids" in inputs, "input_ids should be in inputs!" max_length = 0 for ids in inputs["input_ids"]: max_length = max(max_length, len(ids)) def extend_max_lenth(value, max_length, to_pad_id): return [to_pad_id] * (max_length - len(value)) + value def extend_filed(name, max_length, to_pad_id): values = inputs[name] res = [] for index, value in enumerate(values): res.append(extend_max_lenth(value, max_length, to_pad_id)) inputs[name] = res extend_filed("input_ids", max_length, pad_id) if "attention_mask" in inputs: extend_filed("attention_mask", max_length, 0) if "position_ids" in inputs: extend_filed("position_ids", max_length, 0) return inputs ================================================ FILE: examples/transformer/models/GPT/generation/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
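# 使用已导出的推理模型，通过 InferenceEngine（可选 TensorRT 配置）对示例 prompt 进行文本生成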
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.core.engine import InferenceEngine, TensorRTConfig from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) # build model model, tokenizer = impls.build_model(config) model.eval() if 'Inference' in config: inference_configs = config['Inference'] inference_engine = None else: raise RuntimeError(f'No Inference in config') input_text = 'Hi, GPT2. Tell me who Jack Ma is.' input_ids = [tokenizer.encode(input_text)] if inference_engine is None: # parse TensorRT config tensorrt_config = None if 'TensorRT' in inference_configs: tensorrt_config = TensorRTConfig(**inference_configs['TensorRT']) inference_engine = InferenceEngine(inference_configs['model_dir'], inference_configs['mp_degree'], tensorrt_config) outs = inference_engine.predict([input_ids]) ids = list(outs.values())[0] out_ids = [int(x) for x in ids[0]] result = tokenizer.decode(out_ids) result = input_text + result print('Prompt:', input_text) print('Generation:', result) ================================================ FILE: examples/transformer/models/GPT/generation/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
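# 动态图文本生成入口：构建模型、加载预训练参数后对示例 prompt 进行生成，支持量化压缩与多卡（fleet）运行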
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) # build model model, tokenizer = impls.build_model(config) if 'Compress' in config: input_spec = [ InputSpec( shape=[None, None], name="input_ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) model.eval() cfg.print_config(config) # call fleet wrapper if nranks > 1: model, _, _ = strategy.wrap_with_fleet( config.Distributed, model, optimizer=None, scaler=None) # load pretrained checkpoints if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='generation', load_recovery=None) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None input_text = 'Hi, GPT2. Tell me who Jack Ma is.' input_ids = tokenizer.encode(input_text) inputs = {'input_ids': [input_ids]} inputs = impls.left_padding(inputs, tokenizer.eos_token_id) input_ids = inputs['input_ids'] if len(input_ids) == 0: input_ids = None else: # [1, seq_len] input_ids = paddle.to_tensor(input_ids, dtype='int64') ids, scores = model(input_ids=input_ids) result = [] for i, generated_ids in enumerate(ids): generated_ids = generated_ids.numpy().tolist() # Decode text text = tokenizer.convert_ids_to_string(generated_ids) sequence = input_text + text result.append(sequence) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_gpt_345M_single_card.yaml ================================================ _base_: ./eval_gpt_base.yaml Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 8 micro_batch_size: 8 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 
max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: fused_linear: False fuse_attn_qkv: True sequence_parallel: False Data: Eval: dataset: name: LM_Eval_Dataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 overlapping_eval: sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_pruned_gpt_345M_single_card.yaml ================================================ _base_: ./eval_gpt_base.yaml Model: hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Offline_Eval: eval_path: ./lambada_test.jsonl cloze_eval: True overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_qat_gpt_345M_single_card.yaml ================================================ _base_: ./eval_gpt_base.yaml Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/impls.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
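"""Building blocks for offline evaluation of GPT checkpoints.

Two datasets are provided: ``LM_Eval_Dataset`` slides an (optionally
overlapping) window over a tokenized corpus such as WikiText-103 for
perplexity-style evaluation, while ``Lambada_Eval_Dataset`` appends the
target word to each LAMBADA context and builds a loss mask so that only
the final word is scored. ``eval_impl`` runs one batch under AMP and
returns the summed masked cross-entropy loss in the perplexity case, or
the number of fully matched targets in the cloze case; ``run.py``
accumulates these values into perplexity or accuracy.
"""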
import os import sys import copy import numpy as np import json import re import math import paddle import paddle.distributed as dist from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env from ppfleetx.models.language_model import gpt from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def build_model(config): nranks = dist.get_world_size() model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map', {}) model_setting['freeze_embedding'] = quant_setting.get( 'freeze_embedding', False) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) if nranks == 1: model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: raise RuntimeError( "Only single-card offline eval is supported in GPTModel now.") return model, tokenizer @paddle.no_grad() def eval_impl(config, batch, model): model.eval() use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): tokens, loss_mask, attention_mask, position_ids, labels = batch preds = model(tokens, position_ids, attention_mask) if not config.Offline_Eval.cloze_eval: masked_lm_loss = paddle.nn.functional.cross_entropy( preds, labels, reduction="none") loss = paddle.sum(masked_lm_loss * loss_mask) return loss else: outputs = paddle.argmax(preds, -1) acc = paddle.cast(outputs == labels, 'float32') acc = paddle.where( paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc)) acc = paddle.sum(paddle.prod(acc, -1)) return acc class LM_Eval_Dataset(paddle.io.Dataset): def __init__(self, tokens, max_seq_len, eos_token_id, overlapping_eval=None, **kwargs): self.tokens = tokens self.seq_len = max_seq_len self.pad_idx = eos_token_id self.overlapping_eval = overlapping_eval if self.overlapping_eval is None: self.overlapping_eval = self.seq_len self.overlapping_eval = max(1, self.overlapping_eval) self.total_targets = len(self.tokens) - 1 # remove first sequence tokens targets = max(self.total_targets - self.overlapping_eval, 0) self.total_sequences = max( math.ceil(targets / self.overlapping_eval) + 1, 1) def __len__(self): return self.total_sequences def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss loss_mask = np.ones(seq_length, dtype="float32") loss_mask[np.where(np.array(tokens) == self.pad_idx)] = 0.0 position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default # attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, loss_mask, attention_mask, position_ids, labels] def __getitem__(self, idx): start_idx = idx * self.overlapping_eval end_idx = start_idx + self.seq_len tokens = self.tokens[start_idx:end_idx + 1] num_tokens = len(tokens) if 
num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad [tokens, loss_mask, attention_mask, position_ids, labels] = self._construct_sample(tokens) if self.overlapping_eval != self.seq_len and idx != 0: loss_mask[:-self.overlapping_eval] *= 0 return [tokens, loss_mask, attention_mask, position_ids, labels] class Lambada_Eval_Dataset(paddle.io.Dataset): def __init__(self, tokens, labels, max_seq_len, eos_token_id, **kwargs): self.pad_idx = eos_token_id self.seq_len = max_seq_len self.tokens = tokens self.labels = labels def __len__(self): return len(self.tokens) def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default #attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, attention_mask, position_ids, labels] def __getitem__(self, idx): tokens = self.tokens[idx][:self.seq_len] labels = self.labels[idx] tokens = tokens + labels num_tokens = len(tokens) if num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad loss_mask = np.zeros(self.seq_len, dtype="float32") loss_mask[num_tokens - len(labels) - 1:num_tokens - 1] = 1. [tokens, attention_mask, position_ids, labels] = self._construct_sample(tokens) return [tokens, loss_mask, attention_mask, position_ids, labels] def wikitext_detokenizer(string): # contractions string = string.replace("s '", "s'") string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) # number separators string = string.replace(" @-@ ", "-") string = string.replace(" @,@ ", ",") string = string.replace(" @.@ ", ".") # punctuation string = string.replace(" : ", ": ") string = string.replace(" ; ", "; ") string = string.replace(" . ", ". ") string = string.replace(" ! ", "! ") string = string.replace(" ? ", "? ") string = string.replace(" , ", ", ") # double brackets string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) # miscellaneous string = string.replace("= = = =", "====") string = string.replace("= = =", "===") string = string.replace("= =", "==") string = string.replace(" " + chr(176) + " ", chr(176)) string = string.replace(" \n", "\n") string = string.replace("\n ", "\n") string = string.replace(" N ", " 1 ") string = string.replace(" 's", "'s") return string def get_tokens(tokenizer, text, strict=True): if not strict: tokens = tokenizer.encode(text) return tokens[:-1], [tokens[-1]] last_token = text.split()[-1] start_idx = text.rfind(last_token) beginning_tokens = tokenizer.encode(text[:start_idx].strip()) last_token = tokenizer.encode(' ' + last_token) return beginning_tokens, last_token ================================================ FILE: examples/transformer/models/GPT/offline-eval/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import json import math import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.models.language_model import gpt from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) # process configs eval_cfgs = config.Offline_Eval config.Data.Eval.pop("sampler", None) config.Data.Eval.loader.collate_fn = "gpt_collate_fn" config.Data.Eval.loader.batch_size = eval_cfgs.batch_size config.Data.Eval.dataset.input_dir = eval_cfgs.eval_path config.Data.Eval.dataset.max_seq_len = eval_cfgs.max_seq_len config.Global.logging_freq = eval_cfgs.logging_freq if not eval_cfgs.cloze_eval: config.Data.Eval.dataset.name = "LM_Eval_Dataset" config.Data.Eval.dataset.overlapping_eval = eval_cfgs.overlapping_eval else: config.Data.Eval.dataset.name = "Lambada_Eval_Dataset" cfg.print_config(config) # build GPT model model, tokenizer = impls.build_model(config) if 'Compress' in config: input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
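        # paddle.amp.decorate(..., level='O2') casts the model parameters to
        # float16 so that the evaluation below runs in pure fp16.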
model = paddle.amp.decorate(models=model, level='O2') else: scaler = None # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='eval', load_recovery=load_recovery) # build dataset for eval if not eval_cfgs.cloze_eval: with open(eval_cfgs.eval_path, "rb") as reader: entire_data = reader.read().decode('utf-8') num_original_tokens = len(entire_data.strip().split(" ")) entire_data = impls.wikitext_detokenizer(entire_data) tokenized_data = tokenizer.encode(entire_data) num_tokenized_tokens = len(tokenized_data) print('Original Tokens: %d, Detokenized tokens: %d' % (num_original_tokens, num_tokenized_tokens)) dataset = impls.LM_Eval_Dataset( tokens=tokenized_data, max_seq_len=eval_cfgs.max_seq_len, overlapping_eval=eval_cfgs.overlapping_eval, eos_token_id=tokenizer.eos_token_id) else: tokenized_data = [] tokenized_label = [] with open(eval_cfgs.eval_path, 'r') as f: for line in f.readlines(): text = json.loads(line)['text'] tokens, labels = impls.get_tokens(tokenizer, text) tokenized_data.append(tokens) tokenized_label.append(labels) dataset = impls.Lambada_Eval_Dataset( tokens=tokenized_data, labels=tokenized_label, max_seq_len=eval_cfgs.max_seq_len, eos_token_id=tokenizer.eos_token_id) num_examples = len(dataset) # build dataloader for eval valid_data_loader = cpn.build_dataloader( config.Data.Eval.loader, dataset, batch_sampler=None) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start eval model.eval() total_score = 0 score_name = "loss" if not eval_cfgs.cloze_eval else "number correct" eval_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(config.Global.num_train_epochs): eval_epoch_start = log.get_timestamp() eval_step_start = log.get_timestamp() eval_losses = [] total_eval_batch = len(valid_data_loader) for eval_step, batch in enumerate(valid_data_loader): loss = impls.eval_impl(config, batch, model) eval_losses.append(float(loss)) if eval_step > 0 and eval_step % config.Global.logging_freq == 0: eval_step_cost = log.get_timestamp() - eval_step_start speed = config.Global.logging_freq / eval_step_cost eval_loss = sum(eval_losses) / len(eval_losses) if not eval_cfgs.cloze_eval: total_score += eval_loss * config.Global.logging_freq / ( num_tokenized_tokens - 1) else: total_score += eval_loss * config.Global.logging_freq logger.info( "[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s" % (epoch_index, eval_step, score_name, total_score, speed)) eval_step_start = log.get_timestamp() eval_losses = [] if eval_step >= config.Global.max_steps: break eval_epoch_cost = log.get_timestamp() - eval_epoch_start logger.info( "[eval] epoch {} : evaluting process is complete and cost {}". 
format(epoch_index, log.convert_timestamp_to_data( eval_epoch_cost))) string = '[eval] epoch {} : validation results on {} | '.format( epoch_index, eval_cfgs.eval_path) if not eval_cfgs.cloze_eval: total_loss = float(total_score) ppl = math.exp(min(20, total_loss)) token_ratio = (num_tokenized_tokens - 1) / ( num_original_tokens - 1) adjusted_ppl = math.exp(min(20, total_loss * token_ratio)) string += 'avg loss: {:.4E} | '.format(total_loss) string += 'ppl: {:.4E} | '.format(ppl) string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) string += 'token ratio: {} |'.format(token_ratio) else: num_correct = float(total_score) acc = float(num_correct / num_examples) string += 'number correct: {:.4E} | '.format(num_correct) string += 'total examples: {:.4E} | '.format(num_examples) string += 'avg accuracy: {:.4E}'.format(acc) logger.info(string) # evaluting end log logger.info( "The evaluting process is complete and total cost of time for evaluting is : {}". format( log.convert_timestamp_to_data(log.get_timestamp() - eval_start))) del valid_data_loader if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/export_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_1.3B_dp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 8 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_1.3B_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 
sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_175B_mp8_pp16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 1536 micro_batch_size: 1 Model: vocab_size: 51200 hidden_size: 12288 num_layers: 96 num_attention_heads: 96 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: 'core_attn' no_recompute_layers: virtual_pp_degree: 1 sequence_parallel: True fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 16 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: name: "GPT" fused_linear: False fuse_attn_qkv: True scale_qk_by_layer_num: True sequence_parallel: False no_recompute_layers: vocab_size_divisible_unit: 128 fused_softmax_with_triangular: True Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 
return_list: False collate_fn: gpt_collate_fn Eval: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 use_increments: True grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Distributed: fuse_sequence_parallel_allreduce: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_cn_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: name: "GPT-cn" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/prune_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False comm_overlap: False Optimizer: weight_decay: 0.0 lr: decay_steps: 90000 warmup_rate: 0.00 max_lr: 2.5e-5 min_lr: 5.0e-6 Compress: pretrained: Prune: enable: True criterion: l1_norm ratio: 0.125 ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/qat_gpt_345M_mp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 1 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: 
['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/qat_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/pretrain/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
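"""Export a trained GPT model to a static inference graph.

The script rebuilds the dygraph model from the training config, restores
the checkpoint named by ``Global.save_load.ckpt_dir`` in ``'export'``
mode, and calls ``export_inference_model`` with ``tokens``/``ids`` input
specs of shape ``[None, None]``. When the config contains a
``Compress.Quantization`` section, the model is first wrapped by
``qat.compress_model`` and exported through the quantized path
(``export_quant_model=True``). The exported files are written to
``<output_dir>/rank_<dp_rank>``.
"""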
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.utils.export import export_inference_model from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) if config.Global.mix_precision.enable: logger.info("NOTE: disable mix_precision in export mode") # build GPT model model, _, _ = impls.build_model(config) # export model.eval() input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] output_dir = config.Global.save_load.output_dir dp_rank = 0 if nranks == 1 else env.get_hcg().get_data_parallel_rank() save_dir = os.path.join(output_dir, "rank_{}".format(dp_rank)) quanter = None quant_mode = False if 'Compress' in config: mode = 'compress' compress_configs = config['Compress'] if "Quantization" in compress_configs: quant_mode = True model, quanter = qat.compress_model(config, model, input_spec) # load pretrained checkpoints if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='export', load_recovery=None) if not quant_mode: export_inference_model(model, input_spec, save_dir, 'model') else: logger.info("export quantized model.") export_inference_model( model, input_spec, save_dir, 'model', export_quant_model=True, quanter=quanter) ================================================ FILE: examples/transformer/models/GPT/pretrain/impls.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
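"""Model construction and train/eval step implementations for GPT pretraining.

``build_model`` pads the vocabulary to a multiple of
``vocab_size_divisible_unit * mp_degree`` (for example, the raw GPT-2
vocabulary of 50257 tokens becomes 50304 with the default unit of 128 and
``mp_degree`` 1), logs an estimate of the parameter count, and picks the
single-card, hybrid-parallel or pipeline variant of the model based on
the world size and the ``Distributed`` settings.
``model_forward_backward`` implements gradient accumulation by splitting
each batch into ``accumulate_steps`` micro batches (or delegating to the
pipeline engine), and ``optim_update_params`` performs the data-parallel
gradient reduction followed by the (scaled) optimizer step; ``fit_impl``
ties the two together for one training step.
"""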
import os import sys import copy import paddle import paddle.distributed as dist from paddle.optimizer.lr import LRScheduler from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env, amp import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def _get_model_size(l, h, v, s): P = 0 # embedding P += (v + s) * h # attention P += (4 * h * h + 4 * h) * l # layer_norm of decoder P += (2 * (2 * h)) * l # FFN Layer P += (8 * h * h + 5 * h) * l # layer_norm of transformer P += 2 * h logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def _vocab_size_with_padding(vocab_size, div_unit, mp_degree): padded_size = vocab_size multiple = div_unit * mp_degree while (padded_size % multiple) != 0: padded_size += 1 logger.warning(' > padded vocab (size: {}) with {} dummy tokens ' '(new size: {})'.format(vocab_size, padded_size - vocab_size, padded_size)) return padded_size def build_model(config): nranks = dist.get_world_size() model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map', {}) model_setting['freeze_embedding'] = quant_setting.get( 'freeze_embedding', False) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = _vocab_size_with_padding( model_setting.get('vocab_size', tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), config.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = config.Data.Train.dataset.max_seq_len _get_model_size(l, h, v, s) if nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting['num_partitions'] = config.Distributed.mp_degree if config.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) if config.Model.sequence_parallel: register_sequence_parallel_allreduce_hooks( model, config.Global.accumulate_steps, config.Distributed.fuse_sequence_parallel_allreduce) if nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird( sequence_parallel=config.Model.sequence_parallel) return model, tokenizer, loss_fn def model_forward_backward(config, batch, forward_func, **kwargs): acc_steps = config.Global.accumulate_steps amp_enable = config.Global.mix_precision.enable amp_dtype = config.Global.mix_precision.dtype amp_level = config.Global.mix_precision.level black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list # train with pipeline 
strategy if config.Distributed.pp_degree > 1: tokens, position_ids, labels, loss_mask = batch batch = [(tokens, position_ids), (labels, loss_mask)] batches = [batch] with paddle.amp.auto_cast( amp_enable, custom_black_list=black_list, custom_white_list=white_list, dtype=amp_dtype, level=amp_level): batch = kwargs['model']._prepare_training( batch, kwargs['optimizer'], None) loss = kwargs['model'].forward_backward_pipeline(batch, kwargs['scaler']) return loss # train with non-pipeline strategy if acc_steps == 1: batches = [batch] else: split_batches = [paddle.split(b, acc_steps) for b in batch] batches = [] for i in range(len(split_batches[0])): micro_batch = [split_batch[i] for split_batch in split_batches] batches.append(micro_batch) # gradient merge strategy final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( amp_enable, custom_black_list=black_list, custom_white_list=white_list, dtype=amp_dtype, level=amp_level): # forward in training step loss = forward_func(micro_batch, kwargs['model'], kwargs['loss_fn']) loss_bw = kwargs['scaler'].scale( loss) if amp_enable and amp_dtype == "float16" else loss loss_bw = loss_bw / acc_steps if acc_steps > 1 else loss_bw loss_bw.backward() detach_loss = loss.detach() if final_loss is None: final_loss = detach_loss else: final_loss = paddle.add(final_loss, detach_loss) final_loss = final_loss / acc_steps if acc_steps > 1 else final_loss return final_loss def optim_update_params(config, **kwargs): hcg = env.get_hcg() amp_enable = config.Global.mix_precision.enable amp_dtype = config.Global.mix_precision.dtype dp_degree = config.Distributed.dp_degree sharding_stage = config.Distributed.sharding.sharding_stage if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): if not hasattr(kwargs['optimizer'], "all_fused_tensors") or kwargs[ 'optimizer'].all_fused_tensors is None: fused_allreduce_gradients(list(kwargs['model'].parameters()), None) else: dp_group = hcg.get_data_parallel_group() all_reduce_parameters(kwargs['optimizer'].all_fused_tensors, dp_group) elif isinstance(kwargs['model'], amp.MixPrecisionLayer) \ and dist.get_world_size() > 1 and dist.get_world_size() == dp_degree: fused_allreduce_gradients(list(kwargs['model'].parameters()), None) if sharding_stage == 3 and dp_degree > 1: dp_group = hcg.get_data_parallel_group() fused_allreduce_gradients(kwargs['model'].parameters(), hcg) for p in kwargs['model'].parameters(): if hasattr(p, "bw_storage"): assert p.grad is None, "This case shouldn't happen." 
p.bw_storage.scale_(1.0 / dp_group.nranks) dist.all_reduce(p.bw_storage, group=dp_group) if amp_enable and amp_dtype == 'float16': kwargs['scaler'].step(kwargs['optimizer']) kwargs['scaler'].update() else: kwargs['optimizer'].step() def fit_impl(config, batch, forward_func, **kwargs): kwargs['model'].train() if config.Distributed.pp_degree == 1: if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): with kwargs['model'].no_sync(): loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) optim_update_params(config, **kwargs) return loss @paddle.no_grad() def eval_impl(config, batch, model, loss_fn): model.eval() amp_enable = config.Global.mix_precision.enable amp_dtype = config.Global.mix_precision.dtype amp_level = config.Global.mix_precision.level black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list with paddle.amp.auto_cast( amp_enable, custom_black_list=black_list, custom_white_list=white_list, dtype=amp_dtype, level=amp_level): tokens, position_ids, labels, loss_mask = batch if config.Distributed.pp_degree == 1: tokens, position_ids, labels, loss_mask = batch preds = model(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = loss_fn(preds, labels, loss_mask) else: batch = [(tokens, position_ids), (labels, loss_mask)] loss = model.eval_batch(batch, compute_loss=True) return loss ================================================ FILE: examples/transformer/models/GPT/pretrain/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
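"""Training entry point for the GPT pretraining example.

The script builds the train/eval ``GPTDataset`` loaders, the model, loss
and tokenizer (``impls.build_model``), the learning-rate schedule and the
optimizer named in the config, and sets up fp16/bf16 AMP, optionally with
main-grad via the ``MixPrecision*`` wrappers. For multi-rank runs the
model, optimizer and scaler are wrapped with fleet before an optional
checkpoint is restored. The main loop then calls ``impls.fit_impl`` per
batch, logs loss, step time and throughput in tokens/s, evaluates with
``impls.eval_impl`` every ``eval_freq`` steps, and saves checkpoints with
``io.save`` every ``save_steps`` steps.
"""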
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io, amp from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) # Note: Only for GPTDataset dataset_kwargs = { "seed": config.Global.seed, "model_type": config.Model.name, } sampler_kwargs = {"batch_size": config.Global.local_batch_size, } # build dataloader for training/eval dataset_kwargs.update({"mode": "Train"}) dataset = cpn.build_dataset(config.Data.Train.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset, **sampler_kwargs) train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset, sampler) dataset_kwargs.update({"mode": "Eval"}) dataset = cpn.build_dataset(config.Data.Eval.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset, **sampler_kwargs) valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset, sampler) # build GPT model model, tokenizer, loss_fn = impls.build_model(config) if 'Compress' in config: input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) amp_config = config.Global.mix_precision amp_enable = amp_config['enable'] amp_dtype = amp_config.get('dtype', 'float16') amp_level = amp_config.get('level', 'O2') amp_use_main_grad = amp_config.get('use_main_grad', False) amp_scale_loss = amp_config.get('scale_loss', 32768) if amp_enable: if amp_dtype == "float16": scaler = paddle.amp.GradScaler(init_loss_scaling=amp_scale_loss) elif amp_dtype == "bfloat16": scaler = paddle.amp.GradScaler( init_loss_scaling=1, use_dynamic_loss_scaling=False) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
model = paddle.amp.decorate( models=model, level=amp_level, dtype=amp_dtype) else: scaler = None config.Optimizer.lr.update({ 'epochs': config.Global.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': config.Global.max_steps, }) use_increments = config.Optimizer.lr.pop('use_increments', False) # build lr and optim lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, multi_precision=config.Global.mix_precision.enable) if amp_enable and amp_dtype in [ 'float16', 'bfloat16' ] and amp_level == 'O2' and amp_use_main_grad: model = amp.MixPrecisionLayer(model, dtype=amp_dtype) optimizer = amp.MixPrecisionOptimizer(optimizer) scaler = amp.MixPrecisionScaler(scaler) # call fleet wrapper if nranks > 1: model, optimizer, scaler = strategy.wrap_with_fleet( config.Distributed, model, optimizer, scaler) # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train', load_recovery) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start training train_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(load_recovery['epoch'], config.Global.num_train_epochs): train_epoch_start = log.get_timestamp() # time count train_losses = [] train_step_start = log.get_timestamp() # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. total_train_batch = len(train_data_loader) total_train_step = config.Global.max_steps total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 valid_data_loader = valid_data_loader( ) if valid_data_loader is not None else None eval_finished_step = 0 for step, batch in enumerate(train_data_loader()): if epoch_index == load_recovery['epoch']: if step < load_recovery['step']: continue model.train() fit_kwargs = { "model": model, "loss_fn": loss_fn, "scaler": scaler, "optimizer": optimizer, } def forward_func(batch, model, loss_fn): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = model(tokens, position_ids) loss = loss_fn(preds, labels, loss_mask) return loss loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs) train_losses.append(loss) if lr_scheduler is not None: if scaler is None or scaler._found_inf == 0: lr_scheduler.step(epoch=config.Global.global_batch_size if use_increments else None) # training step log if (step + 1) % config.Global.logging_freq == 0: train_step_cost = log.get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] train_cost = train_step_cost \ if step == 0 else train_step_cost / config.Global.logging_freq speed = 1. 
/ train_cost default_global_tokens_num = config.Global.global_batch_size * \ config.Data.Train.dataset.max_seq_len ips_total = speed * default_global_tokens_num ips = ips_total / env.get_data_world_size() loss_scale_str = " loss_scale: %.9f," % ( scaler._scale.numpy()[0]) if scaler is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s,%s learning rate: %.5e, found_inf: %d" % (epoch_index, config.Global.num_train_epochs, step, total_train_step, sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips, loss_scale_str, optimizer.get_lr(), scaler._found_inf if scaler is not None else 0)) train_step_start = log.get_timestamp() train_losses = [] optimizer.clear_grad() # start eval if step > 0 and config.Global.eval_freq > 0 and step % config.Global.eval_freq == 0: eval_losses = [] eval_step_start = log.get_timestamp() for eval_step, batch in enumerate(valid_data_loader): eval_finished_step += 1 loss = impls.eval_impl(config, batch, model, loss_fn) eval_losses.append(loss) if eval_step >= config.Global.eval_iters - 1: break eval_step_cost = log.get_timestamp() - eval_step_start eval_loss = sum(eval_losses) / len(eval_losses) eval_cost = eval_step_cost / config.Global.logging_freq logger.info( "[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (epoch_index, eval_step, eval_finished_step, float(eval_loss), eval_cost, 1. / eval_cost)) if step > 0 and config.Global.save_load.save_steps > 0 and \ step % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=step, epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) if step >= config.Global.max_steps: break if profiler: profiler.step() # training epoch log train_epoch_cost = log.get_timestamp() - train_epoch_start logger.info("[Training] epoch: %d, total time: %.5f sec" % (epoch_index, train_epoch_cost)) # training end log logger.info( "The training process is complete and total cost of time for training is : {}". 
        format(
            log.convert_timestamp_to_data(log.get_timestamp() - train_start)))

    if profiler:
        cpn.profiler_done(profiler, config.Profiler)


================================================
FILE: examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml
================================================
_base_: ./pretrain_moe_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 2
  max_steps: 20000
  logging_freq: 10
  mix_precision:
    enable: True

Data:
  Train:
    dataset:
      split: [98,2,0]
    loader:
      num_workers: 0
  Eval:
    dataset:
      split: [98,2,0]

Model:
  vocab_size: 50304
  hidden_size: 768
  num_layers: 12
  num_attention_heads: 12
  ffn_hidden_size: 3072
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.014
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:
  # one entry per group of MoE layers; a single value is broadcast to all
  # MoE layers (see _get_model_size in pretrain_moe/impls.py)
  num_experts: [2]
  expert_interval: 2
  topk: 1
  moe_use_residual: False #True
  moe_train_capacity_factor: 1.0
  moe_eval_capacity_factor: 1.0
  moe_min_capacity: 4
  moe_token_dropping: True
  balance_loss_weight: 0.01
  enable_expert_tensor_parallelism: False

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
    reduce_overlap: False
    broadcast_overlap: False


================================================
FILE: examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml
================================================
Global:
  device: gpu
  seed: 1234

  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

  max_steps: 500000
  num_train_epochs: 1
  accumulate_steps:
  logging_freq: 1
  eval_freq: 1000
  eval_iters: 10
  test_iters:
  mix_precision:
    enable: True
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]

  save_load:
    save_steps: 1000
    save_epoch: 1
    output_dir: ./output
    ckpt_dir:

Model:
  name: "GPT"
  fused_linear: False
  fuse_attn_qkv: True
  sequence_parallel: False
  no_recompute_layers:

Data:
  Train:
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
      collate_fn: gpt_collate_fn

  Eval:
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
      collate_fn: gpt_collate_fn

Optimizer:
  name: FusedAdamW
  weight_decay: 0.1
  beta1: 0.9
  beta2: 0.95
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 20000
    warmup_rate: 0.01
    max_lr: 4.5e-4
    min_lr: 4.5e-6
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False

Distributed:
  fuse_sequence_parallel_allreduce: False


================================================
FILE: examples/transformer/models/GPT/pretrain_moe/impls.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import numpy as np import paddle import paddle.distributed as dist from paddle.optimizer.lr import LRScheduler from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def _get_model_size(l, h, v, s, ne, ei): assert len(ne) == 1 or len(ne) == l // ei, \ 'num_experts must be either a single value or a list of the same length as the number of MoE layers' P = 0 # embedding P += (v + s) * h logger.info(f'vs: {v} {s}') moe_mode = True if len(ne) == 1: if ne[0] == 1: moe_mode = False ne = ne * (l // ei) for i in range(l): # attention P += 4 * h * h + 4 * h # layer_norm of decoder P += 2 * (2 * h) # MoE Layer if ((i + 1) % ei == 0) and moe_mode: nei = ne[i // ei] # gate P += (h * nei + nei) # experts P += nei * (8 * h * h + 5 * h) # FFN Layer else: P += 8 * h * h + 5 * h # layer_norm of transformer P += 2 * h logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def build_model(config): nranks = dist.get_world_size() model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map', {}) model_setting['freeze_embedding'] = quant_setting.get( 'freeze_embedding', False) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = model_setting['max_position_embeddings'] ne = model_setting['num_experts'] ei = model_setting['expert_interval'] _get_model_size(l, h, v, s, ne, ei) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting.pop("balance_loss_weight") if nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting['num_partitions'] = config.Distributed.mp_degree if config.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) if config.Model.sequence_parallel: register_sequence_parallel_allreduce_hooks( model, config.Global.accumulate_steps, config.Distributed.fuse_sequence_parallel_allreduce) if nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird( sequence_parallel=config.Model.sequence_parallel) return model, tokenizer, loss_fn def model_forward_backward(config, batch, forward_func, **kwargs): acc_steps = 
config.Global.accumulate_steps use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list # HACK: add 'expand' to black_list (put_along_axis_) black_list.append('expand_v2') # train with pipeline strategy if config.Distributed.pp_degree > 1: tokens, position_ids, labels, loss_mask = batch batch = [(tokens, position_ids), (labels, loss_mask)] batches = [batch] with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): batch = kwargs['model']._prepare_training( batch, kwargs['optimizer'], None) loss = kwargs['model'].forward_backward_pipeline(batch, kwargs['scaler']) return loss # train with non-pipeline strategy if acc_steps == 1: batches = [batch] else: split_batches = [paddle.split(b, acc_steps) for b in batch] batches = [] for i in range(len(split_batches[0])): micro_batch = [split_batch[i] for split_batch in split_batches] batches.append(micro_batch) # gradient merge strategy final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): # forward in training step loss = forward_func(micro_batch, kwargs['model'], kwargs['loss_fn']) # calculate auxiliary loss to balance experts' load if max(config.Model. num_experts) > 1 and config.Model.balance_loss_weight: aux_loss_list = [ l.moe_mlp.fleetx_moe.get_loss() for l in kwargs['model'].gpt.decoder.layers if l.moe_mlp is not None ] bal_loss = paddle.concat(aux_loss_list) if bal_loss.dtype == paddle.float16: bal_loss = paddle.cast(bal_loss, dtype=paddle.float32) bal_loss = bal_loss.mean() loss += bal_loss * config.Model.balance_loss_weight loss_bw = kwargs['scaler'].scale(loss) if use_fp16 else loss loss_bw = loss_bw / acc_steps if acc_steps > 1 else loss_bw loss_bw.backward() detach_loss = loss.detach() if final_loss is None: final_loss = detach_loss else: final_loss = paddle.add(final_loss, detach_loss) final_loss = final_loss / acc_steps if acc_steps > 1 else final_loss return final_loss def optim_update_params(config, **kwargs): hcg = env.get_hcg() use_fp16 = config.Global.mix_precision.enable dp_degree = config.Distributed.dp_degree sharding_stage = config.Distributed.sharding.sharding_stage if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): if not hasattr(kwargs['optimizer'], "all_fused_tensors") or kwargs[ 'optimizer'].all_fused_tensors is None: fused_allreduce_gradients(list(kwargs['model'].parameters()), None) else: dp_group = hcg.get_data_parallel_group() all_reduce_parameters(kwargs['optimizer'].all_fused_tensors, dp_group) if sharding_stage == 3 and dp_degree > 1: dp_group = hcg.get_data_parallel_group() fused_allreduce_gradients(kwargs['model'].parameters(), hcg) for p in kwargs['model'].parameters(): if hasattr(p, "bw_storage"): assert p.grad is None, "This case shouldn't happen." 
p.bw_storage.scale_(1.0 / dp_group.nranks) dist.all_reduce(p.bw_storage, group=dp_group) if use_fp16: kwargs['scaler'].step(kwargs['optimizer']) kwargs['scaler'].update() else: kwargs['optimizer'].step() def fit_impl(config, batch, forward_func, **kwargs): kwargs['model'].train() if config.Distributed.pp_degree == 1: if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): with kwargs['model'].no_sync(): loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) optim_update_params(config, **kwargs) return loss @paddle.no_grad() def eval_impl(config, batch, model, loss_fn): model.eval() use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): tokens, position_ids, labels, loss_mask = batch if config.Distributed.pp_degree == 1: tokens, position_ids, labels, loss_mask = batch preds = model(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = loss_fn(preds, labels, loss_mask) else: batch = [(tokens, position_ids), (labels, loss_mask)] loss = model.eval_batch(batch, compute_loss=True) return loss ================================================ FILE: examples/transformer/models/GPT/pretrain_moe/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
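As a cross-check on the MoE configuration above, the parameter-count estimate that _get_model_size() in impls.py logs can be reproduced offline. The following standalone sketch is not part of the repository (the helper name estimate_moe_gpt_params is ours); it re-implements the same counting formula with the hyperparameters from pretrain_moe_345M_single_card.yaml.

# Standalone sketch (not repository code): mirrors _get_model_size() in impls.py above.
def estimate_moe_gpt_params(l, h, v, s, ne, ei):
    """Rough parameter count for an MoE GPT: l layers, hidden h, vocab v, seq len s,
    experts ne (one entry per MoE interval, or a single value), expert interval ei."""
    ne = list(ne)
    if len(ne) == 1:
        moe_mode = ne[0] > 1
        ne = ne * (l // ei)
    else:
        moe_mode = True
    p = (v + s) * h                          # token + position embeddings
    for i in range(l):
        p += 4 * h * h + 4 * h               # attention projections (QKV + output)
        p += 2 * (2 * h)                     # two LayerNorms per decoder layer
        if moe_mode and (i + 1) % ei == 0:
            nei = ne[i // ei]
            p += h * nei + nei               # gate
            p += nei * (8 * h * h + 5 * h)   # one FFN per expert
        else:
            p += 8 * h * h + 5 * h           # dense FFN
    p += 2 * h                               # final LayerNorm
    return p

# Values from pretrain_moe_345M_single_card.yaml: 12 layers, hidden 768, vocab 50304,
# max_position_embeddings 1024, num_experts (2,), expert_interval 2 -> roughly 0.15 B parameters.
print("{:.2f} B".format(estimate_moe_gpt_params(12, 768, 50304, 1024, (2,), 2) / 1e9))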
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(1, os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=True) # HACK: use certain device paddle.set_device(config.Global.device + ':3') # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) # Note: Only for GPTDataset dataset_kwargs = { "seed": config.Global.seed, "model_type": config.Model.name, } sampler_kwargs = {"batch_size": config.Global.local_batch_size, } # build dataloader for training/eval dataset_kwargs.update({"mode": "Train"}) dataset = cpn.build_dataset(config.Data.Train.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset, **sampler_kwargs) train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset, sampler) dataset_kwargs.update({"mode": "Eval"}) dataset = cpn.build_dataset(config.Data.Eval.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset, **sampler_kwargs) valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset, sampler) # build GPT model model, tokenizer, loss_fn = impls.build_model(config) if 'Compress' in config: from examples.transformer.utils import qat input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
model = paddle.amp.decorate(models=model, level='O2') else: scaler = None config.Optimizer.lr.update({ 'epochs': config.Global.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': config.Global.max_steps, }) # build lr and optim lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: model, optimizer, scaler = strategy.wrap_with_fleet( config.Distributed, model, optimizer, scaler) # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train', load_recovery) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start training train_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(load_recovery['epoch'], config.Global.num_train_epochs): train_epoch_start = log.get_timestamp() # time count train_losses = [] train_step_start = log.get_timestamp() # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. total_train_batch = len(train_data_loader) total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 for step, batch in enumerate(train_data_loader): if epoch_index == load_recovery['epoch']: if step <= load_recovery['step']: continue model.train() fit_kwargs = { "model": model, "loss_fn": loss_fn, "scaler": scaler, "optimizer": optimizer, } def forward_func(batch, model, loss_fn): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = model(tokens, position_ids) loss = loss_fn(preds, labels, loss_mask) return loss loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs) train_losses.append(loss) # training step log if (step + 1) % config.Global.logging_freq == 0: train_step_cost = log.get_timestamp() - train_step_start numpy_losses = [loss.numpy()[0] for loss in train_losses] train_cost = train_step_cost \ if step == 0 else train_step_cost / config.Global.logging_freq speed = 1. 
/ train_cost default_global_tokens_num = config.Global.global_batch_size * \ config.Data.Train.dataset.max_seq_len ips_total = speed * default_global_tokens_num ips = ips_total / env.get_data_world_size() logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" % (epoch_index, step, sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips, optimizer.get_lr())) train_step_start = log.get_timestamp() train_losses = [] if lr_scheduler is not None: lr_scheduler.step() optimizer.clear_grad() # start eval if step > 0 and config.Global.eval_freq > 0 and step % config.Global.eval_freq == 0: eval_losses = [] eval_step_start = log.get_timestamp() for eval_step, batch in enumerate(valid_data_loader): loss = impls.eval_impl(config, batch, model, loss_fn) eval_losses.append(loss) if eval_step >= config.Global.eval_iters - 1: break eval_step_cost = log.get_timestamp() - eval_step_start eval_loss = sum(eval_losses) / len(eval_losses) eval_cost = eval_step_cost / config.Global.logging_freq logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (epoch_index, eval_step, eval_loss.numpy()[0], eval_cost, 1. / eval_cost)) if step > 0 and config.Global.save_load.save_steps > 0 and \ step % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=step, epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) if step >= config.Global.max_steps: break if profiler: profiler.step() # training epoch log train_epoch_cost = log.get_timestamp() - train_epoch_start logger.info("[Training] epoch: %d, total time: %.5f sec" % (epoch_index, train_epoch_cost)) # training end log logger.info( "The training process is complete and total cost of time for training is : {}". format( log.convert_timestamp_to_data(log.get_timestamp() - train_start))) if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/utils/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: examples/transformer/utils/components.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import random import numpy as np import paddle import paddle.distributed as dist from paddle.optimizer.lr import LRScheduler from paddle.profiler import SummaryView from ppfleetx.data import dataset, sampler, utils from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from ppfleetx.optims import optimizer, grad_clip, lr_scheduler def build_dataset(config_dataset, **config_kwargs): # build dataset if config_dataset is not None: config_dataset = copy.deepcopy(config_dataset) dataset_name = config_dataset.pop('name') config_dataset.update(config_kwargs) dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) else: dataset = None return dataset def build_batch_sampler(config_sampler, dataset, **config_kwargs): # build sampler if config_sampler is not None: config_sampler = copy.deepcopy(config_sampler) sampler_name = config_sampler.pop("name") config_sampler.update(config_kwargs) batch_sampler = eval("sampler.{}".format(sampler_name))( dataset, **config_sampler) logger.debug("build batch_sampler({}) success...".format( batch_sampler)) else: batch_sampler = None return batch_sampler def build_dataloader(config_loader, dataset, batch_sampler=None, **config_kwargs): collate_fn = None if config_loader is not None: config_loader = copy.deepcopy(config_loader) config_loader.update(config_kwargs) collate_fn_cfg = config_loader.pop('collate_fn', None) if isinstance(collate_fn_cfg, str): collate_fn = getattr( utils, collate_fn_cfg) if collate_fn_cfg is not None else None elif isinstance(collate_fn_cfg, dict): collate_fn_class_name = collate_fn_cfg.pop("name") collate_fn = eval("utils.{}".format(collate_fn_class_name))( **collate_fn_cfg) logger.debug("build collate_fn({}) success...".format(collate_fn)) def worker_init_fn(worker_id): """ set seed in subproces for dataloader when num_workers > 0""" np.random.seed(env.get_dp_seed() + worker_id) random.seed(env.get_dp_seed() + worker_id) data_loader = paddle.io.DataLoader( dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, worker_init_fn=worker_init_fn, **config_loader) logger.debug("build data_loader({}) success...".format(data_loader)) return data_loader def build_lr_scheduler(lr_config): if 'name' in lr_config: lr_name = lr_config.pop('name') lr = eval("lr_scheduler.{}".format(lr_name))(**lr_config) if isinstance(lr, LRScheduler): return lr else: return lr() else: lr = lr_config.learning_rate logger.debug("build lr ({}) success..".format(lr)) return lr def build_grad_clip(grad_clip_config): if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval("grad_clip.{}".format(grad_clip_name))( **grad_clip_config) return grad_clip else: return None def build_optimizer(config, model, lr_scheduler=None, multi_precision=False): config = copy.deepcopy(config) if lr_scheduler is not None: config.pop('lr') grad_clip_config = config.pop('grad_clip', None) grad_clip = build_grad_clip(grad_clip_config) optim_name = config.pop('name') optim = eval("optimizer.{}".format(optim_name))( learning_rate=lr_scheduler, parameters=model.parameters(), grad_clip=grad_clip, multi_precision=multi_precision, **config) logger.debug("build optimizer ({}) success..".format(optim)) return optim def build_profiler(profiler_config): profiler = None if profiler_config.get('enable', 
False): scheduler = profiler_config.get('scheduler', None) profiler_log = profiler_config.get('profiler_log', './profiler_log') record_shapes = profiler_config.get('record_shapes', True) profile_memory = profiler_config.get('profile_memory', True) profiler = paddle.profiler.Profiler( targets=[ paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU ], scheduler=scheduler, on_trace_ready=paddle.profiler.export_chrome_tracing(profiler_log), record_shapes=record_shapes, profile_memory=profile_memory) profiler.start() logger.warning("Profiler is enabled, do not enable it in production.") return profiler def profiler_done(profiler, profiler_config): if not profiler: return logger.info("Profiler finished, prepare to print summary...") profiler.stop() _print_summary(profiler, profiler_config) profiler_log = profiler_config.get('profiler_log', './profiler_log') logger.info( "For more information please install visualdl and run it with following command:" ) logger.info( "-------------------------------------------------------------------------------" ) logger.info(f"visualdl --host 0.0.0.0 --logdir {profiler_log}") logger.info( "-------------------------------------------------------------------------------" ) def _print_summary(profiler, profiler_config): views_dict = { SummaryView.DeviceView: 'device', SummaryView.OverView: 'overview', SummaryView.ModelView: 'model', SummaryView.DistributedView: 'dist', SummaryView.KernelView: 'kernel', SummaryView.OperatorView: 'op', SummaryView.MemoryView: 'mem', SummaryView.MemoryManipulationView: 'memcpy', SummaryView.UDFView: 'udf', } default_views = [ SummaryView.OverView, SummaryView.ModelView, SummaryView.KernelView, SummaryView.OperatorView, ] def gen_views(cfg): # print all summary view if detailed=True if profiler_config.get('detailed', False): return None views = [] # override default view with user defined value if detailed=False for view in SummaryView: v = profiler_config.get('summary', {}).get(views_dict[view], None) if v is True or (v is None and view in default_views): views.append(view) return views or None profiler.summary( sorted_by=paddle.profiler.SortedKeys.GPUTotal, views=gen_views(profiler_config)) ================================================ FILE: examples/transformer/utils/config.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import logging import os import sys import copy import argparse import codecs import yaml import numpy as np import paddle import paddle.distributed as dist from paddle.fluid import core from paddle.fluid.reader import use_pinned_memory from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger, advertise from ppfleetx.utils import check __all__ = ['get_config', 'print_config'] class AttrDict(dict): def __getattr__(self, key): return self[key] def __setattr__(self, key, value): if key in self.__dict__: self.__dict__[key] = value else: self[key] = value def __copy__(self): cls = self.__class__ result = cls.__new__(cls) result.__dict__.update(self.__dict__) return result def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): setattr(result, k, copy.deepcopy(v, memo)) for k, v in self.items(): setattr(result, k, copy.deepcopy(v, memo)) return result def setdefault(self, k, default=None): if k not in self or self[k] is None: self[k] = default return default else: return self[k] def create_attr_dict(yaml_config): from ast import literal_eval for key, value in yaml_config.items(): if type(value) is dict: yaml_config[key] = value = AttrDict(value) if isinstance(value, str): try: value = literal_eval(value) except BaseException: pass if isinstance(value, AttrDict): create_attr_dict(yaml_config[key]) else: yaml_config[key] = value def parse_config(cfg_file): """Load a config file into AttrDict""" def _update_dic(dic, base_dic): '''Update config from dic based base_dic ''' base_dic = base_dic.copy() dic = dic.copy() if dic.get('_inherited_', True) == False: dic.pop('_inherited_') return dic for key, val in dic.items(): if isinstance(val, dict) and key in base_dic: base_dic[key] = _update_dic(val, base_dic[key]) else: base_dic[key] = val dic = base_dic return dic def _parse_from_yaml(path): '''Parse a yaml file and build config''' with codecs.open(path, 'r', 'utf-8') as file: dic = yaml.load(file, Loader=yaml.FullLoader) if '_base_' in dic: cfg_dir = os.path.dirname(path) base_path = dic.pop('_base_') base_path = os.path.join(cfg_dir, base_path) base_dic = _parse_from_yaml(base_path) dic = _update_dic(dic, base_dic) return dic yaml_dict = _parse_from_yaml(cfg_file) yaml_config = AttrDict(yaml_dict) create_attr_dict(yaml_config) return yaml_config def print_dict(d, delimiter=0): """ Recursively visualize a dict and indenting acrrording by the relationship of keys. 
""" placeholder = "-" * 60 for k, v in sorted(d.items()): if isinstance(v, dict): logger.info("{}{} : ".format(delimiter * " ", k)) print_dict(v, delimiter + 4) elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): logger.info("{}{} : ".format(delimiter * " ", k)) for value in v: print_dict(value, delimiter + 4) else: logger.info("{}{} : {}".format(delimiter * " ", k, v)) if k.isupper(): logger.info(placeholder) def print_config(config): """ visualize configs Arguments: config: configs """ advertise() print_dict(config) def check_config(config): """ Check config """ # global_batch_size = config.get("") global_config = config.get('Global') check.check_version() device = global_config.get('device', 'gpu') device = device.lower() if device in ['gpu', 'xpu', 'rocm', 'npu', "cpu"]: check.check_device(device) else: raise ValueError( f"device({device}) is not in ['gpu', 'xpu', 'rocm', 'npu', 'cpu'],\n" "Please ensure the config option Global.device is one of these devices" ) def override(dl, ks, v): """ Recursively replace dict of list Args: dl(dict or list): dict or list to be replaced ks(list): list of keys v(str): value to be replaced """ def str2num(v): try: return eval(v) except Exception: return v assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") assert len(ks) > 0, ('lenght of keys should larger than 0') if isinstance(dl, list): k = str2num(ks[0]) if len(ks) == 1: assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) dl[k] = str2num(v) else: override(dl[k], ks[1:], v) else: if len(ks) == 1: # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) if not ks[0] in dl: print('A new field ({}) detected!'.format(ks[0], dl)) dl[ks[0]] = str2num(v) else: if ks[0] not in dl.keys(): dl[ks[0]] = {} print("A new Series field ({}) detected!".format(ks[0], dl)) override(dl[ks[0]], ks[1:], v) def override_config(config, options=None): """ Recursively override the config Args: config(dict): dict to be replaced options(list): list of pairs(key0.key1.idx.key2=value) such as: [ 'topk=2', 'VALID.transforms.1.ResizeImage.resize_short=300' ] Returns: config(dict): replaced config """ if options is not None: for opt in options: assert isinstance(opt, str), ( "option({}) should be a str".format(opt)) assert "=" in opt, ( "option({}) should contain a =" "to distinguish between key and value".format(opt)) pair = opt.split('=') assert len(pair) == 2, ("there can be only a = in the option") key, value = pair keys = key.split('.') override(config, keys, value) return config def get_config(fname, overrides=None, show=False): """ Read config from file """ assert os.path.exists(fname), ( 'config file({}) is not exist'.format(fname)) config = parse_config(fname) override_config(config, overrides) process_dist_config(config) process_global_configs(config) create_attr_dict(AttrDict(config)) if show: print_config(config) check_config(config) return config def parse_args(): parser = argparse.ArgumentParser("train script") parser.add_argument( '-c', '--config', type=str, default='configs/config.yaml', help='config file path') parser.add_argument( '-o', '--override', action='append', default=[], help='config options to be overridden') args = parser.parse_args() return args def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False def process_dist_config(configs): """ process distributed strategy for hybrid parallel """ nranks = 
dist.get_world_size() config = configs['Distributed'] config.setdefault("hcg", "HybridCommunicateGroup") mp_degree = config.setdefault("mp_degree", 1) pp_degree = config.setdefault("pp_degree", 1) pp_recompute_interval = config.setdefault("pp_recompute_interval", 1) # sharding default sharding_config = config['sharding'] sharding_degree = sharding_config.setdefault("sharding_degree", 1) sharding_stage = sharding_config.setdefault('sharding_stage', 2) sharding_offload = sharding_config.setdefault('sharding_offload', False) reduce_overlap = sharding_config.setdefault('reduce_overlap', False) broadcast_overlap = sharding_config.setdefault('broadcast_overlap', False) other_degree = mp_degree * pp_degree * sharding_degree assert nranks % other_degree == 0, "unreasonable config of dist_strategy." dp_degree = config.setdefault("dp_degree", nranks // other_degree) assert nranks % dp_degree == 0, "unreasonable config of dist_strategy." assert nranks == dp_degree * other_degree, \ "Mismatched config using {} cards with dp_degree[{}]," \ "mp_degree[{}], pp_degree[{}] and sharding_degree[{}]".format(nranks, \ dp_degree, mp_degree, pp_degree, sharding_degree) if sharding_config['sharding_degree'] > 1 and reduce_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['reduce_overlap'] = False logger.warning( "reduce overlap only valid for sharding stage 2 without offload" ) if sharding_config['sharding_degree'] > 1 and broadcast_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['broadcast_overlap'] = False logger.warning( "broadcast overlap only valid for sharding stage 2 without offload" ) if broadcast_overlap and configs['Global']['logging_freq'] == 1: logger.warning( "Set logging_freq to 1 will disable broadcast_overlap. " "If you want to overlap the broadcast, please increase the logging_freq." ) sharding_config['broadcast_overlap'] = False if sharding_config['sharding_degree'] > 1: if getattr(sharding_config, 'broadcast_overlap', False): logger.warning( "Enable broadcast overlap for sharding will not use pin memory for dataloader" ) use_pinned_memory(False) if 'fuse_sequence_parallel_allreduce' not in config: config['fuse_sequence_parallel_allreduce'] = False def process_global_configs(config): """ process global configs for hybrid parallel """ dp_degree = config['Distributed']['dp_degree'] pp_degree = config['Distributed']['pp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] config['Global']['enable_partial_send_recv'] = True if 'sequence_parallel' in config['Model'] and pp_degree > 1: if config['Model']['sequence_parallel']: config['Global']['enable_partial_send_recv'] = False logger.warning( "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv will be set False." 
) global_cfg = config['Global'] # Set environment variable flags = global_cfg.get("flags", {}) paddle.set_flags(flags) for k, v in flags.items(): logger.info("Environment variable {} is set {}.".format(k, v)) if global_cfg['global_batch_size'] is None and global_cfg[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is not None: assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == (dp_degree * sharding_degree), "global_batch_size[{}] should be divided by local_batch_size[{}] "\ "when dp_degree is [{}] and sharding_degree is [{}]".format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree, sharding_degree) elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is None: assert global_cfg['global_batch_size'] % (dp_degree * sharding_degree) == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]"\ .format(global_cfg['global_batch_size'], dp_degree, sharding_degree) global_cfg['local_batch_size'] = global_cfg['global_batch_size'] // ( dp_degree * sharding_degree) else: global_cfg['global_batch_size'] = global_cfg[ 'local_batch_size'] * dp_degree * sharding_degree assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0 # save_load global_cfg['save_load'] = global_cfg.get('save_load', {}) save_load_cfg = global_cfg.save_load save_steps = save_load_cfg.get('save_steps', None) save_epoch = save_load_cfg.get('save_epoch', None) if save_steps is None or save_steps == -1: save_load_cfg[ 'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint if save_epoch is None or save_epoch == -1: save_load_cfg['save_epoch'] = 1 save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output') save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None) # mix_precision global_cfg['mix_precision'] = global_cfg.get('mix_precision', {}) amp_cfg = global_cfg.mix_precision amp_cfg['enable'] = amp_cfg.get('enable', False) amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768) amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None) amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None) global_cfg['max_steps'] = global_cfg.get('max_steps', 500000) global_cfg['eval_freq'] = global_cfg.get('eval_freq', -1) global_cfg['eval_iters'] = global_cfg.get('eval_iters', 0) global_cfg['logging_freq'] = global_cfg.get('logging_freq', 1) global_cfg['num_train_epochs'] = global_cfg.get('num_train_epochs', 1) global_cfg['test_iters'] = global_cfg['eval_iters'] * 10 \ if global_cfg.get('test_iters', None) is None else global_cfg['test_iters'] global_cfg[ 'accumulate_steps'] = global_cfg.local_batch_size // global_cfg.micro_batch_size def process_model_configs(config): """ process model configs for hybrid parallel """ configs = config['Model'] if configs['ffn_hidden_size'] is None: configs['ffn_hidden_size'] = 4 * configs['hidden_size'] if configs['use_recompute']: if not configs['recompute_granularity']: configs['recompute_granularity'] = 'full' if not configs['no_recompute_layers']: configs['no_recompute_layers'] = [] else: assert isinstance(configs['no_recompute_layers'], list), "no_recompute_layers should be a list" for i in configs['no_recompute_layers']: assert isinstance( i, int ), "all values in no_recompute_layers should be an integer" assert min(configs['no_recompute_layers']) >= 0, \ "the min value in 
no_recompute_layers should >= 0" assert max(configs['no_recompute_layers']) < configs['num_layers'], \ "the max value in no_recompute_layers should < num_layers" configs['no_recompute_layers'] = sorted( list(set(configs['no_recompute_layers']))) if configs['fused_linear'] and not is_fused_matmul_bias_supported(): configs['fused_linear'] = False logging.warning( "The flag fused_linear only valid for cuda version higher than 11.6, " "but the paddle is compiled with cuda " + paddle.version.cuda()) pp_degree = config.Distributed.pp_degree if pp_degree > 1: configs['virtual_pp_degree'] = 1 \ if configs.get('virtual_pp_degree', None) is None \ else configs['virtual_pp_degree'] virtual_pp_degree = configs['virtual_pp_degree'] num_layers = configs.num_layers if not (num_layers % (virtual_pp_degree * pp_degree)) == 0: assert virtual_pp_degree == 1, "virtual pp doesn't support uneven layer split." logger.warning( "The num_layers of the model is not divisible by pp_degree." \ "Receive num_layers: {}, pp_degree: {}.".format(num_layers, pp_degree)) else: assert (num_layers % (virtual_pp_degree * pp_degree)) == 0, \ "The num_layers of the model should be divisible of pp_degree * virtual_pp_degree." \ "Receive num_layers: {}, pp_degree: {}, virtual_pp_degree: {}.".format( num_layers, pp_degree, virtual_pp_degree) if virtual_pp_degree > 1: local_batch_size = config.Global.local_batch_size micro_batch_size = config.Global.micro_batch_size acc_steps = local_batch_size // micro_batch_size assert acc_steps % pp_degree == 0, "num of microbatches {} should be divisible of pp_degree {} when " \ "using interleave pipeline".format(acc_steps, pp_degree) if virtual_pp_degree > 2: logger.warning( "Setting virtual_pp_degree > 2 may harm the throughput of the pipeline parallel." 
) else: if configs.get('virtual_pp_degree', None): logger.warning("virtual_pp_degree is unuseful.") def process_optim_configs(config): """ process optim configs for hybrid parallel """ if 'Optimizer' not in config.keys(): return nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] if config['Optimizer']['tensor_fusion']: assert nranks == dp_degree * sharding_degree, \ "tensor_fusion only support single card train or data/sharding parallel train" if config['Optimizer']['lr']['decay_steps'] is None: config['Optimizer']['lr']['decay_steps'] = config['Engine'][ 'max_steps'] config['Optimizer']['lr']['decay_steps'] *= config['Global'][ 'global_batch_size'] def process_data_configs(config): """ process data configs for hybrid parallel """ if 'Data' not in config.keys(): return cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Global']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Global']['max_steps'] // config['Global']['eval_freq'] + 1) * config['Global']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Global']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] def process_inference_configs(config): """ process inference configs for hybrid parallel """ if 'Inference' not in config.keys(): return configs = config['Inference'] if configs['model_dir'] is None: configs['model_dir'] = config['Global']['save_load']['output_dir'] if configs['mp_degree'] is None: configs['mp_degree'] = config['Distributed']['mp_degree'] def process_configs(config): process_data_configs(config) process_model_configs(config) process_optim_configs(config) process_inference_configs(config) return config ================================================ FILE: examples/transformer/utils/qat.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
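process_global_configs() above pins down how the batch-size fields relate: global_batch_size = local_batch_size * dp_degree * sharding_degree, accumulate_steps = local_batch_size // micro_batch_size, and whichever of global/local is missing is derived from the other. A standalone sketch of that arithmetic (not repository code; the name derive_batch_sizes is ours):

# Standalone sketch of the batch-size arithmetic in process_global_configs().
def derive_batch_sizes(global_bs, local_bs, micro_bs, dp_degree, sharding_degree):
    replicas = dp_degree * sharding_degree
    if global_bs is None and local_bs is None:
        raise ValueError("global_batch_size or local_batch_size should be set.")
    elif global_bs is not None and local_bs is not None:
        assert global_bs // local_bs == replicas
    elif global_bs is not None:
        assert global_bs % replicas == 0
        local_bs = global_bs // replicas
    else:
        global_bs = local_bs * replicas
    assert local_bs % micro_bs == 0
    return global_bs, local_bs, local_bs // micro_bs  # last value is accumulate_steps

# e.g. the MoE single-card config (local=8, micro=2, dp=1, sharding=1)
# yields global_batch_size=8 and accumulate_steps=4.
print(derive_batch_sizes(None, 8, 2, 1, 1))  # (8, 8, 4)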
import paddle

from ppfleetx.distributed.apis import io
from ppfleetx.utils.compression_helper import prune_model, quant_model


def compress_model(config, model, input_spec):
    quanter, quant_configs = None, None
    prune_configs, compress_configs = None, None

    if 'Compress' in config:
        compress_configs = config['Compress']
        if "Prune" in compress_configs:
            prune_configs = compress_configs["Prune"]
        if "Quantization" in compress_configs:
            quant_configs = compress_configs["Quantization"]

        # Load pretrained model before compression
        if 'pretrained' in compress_configs and compress_configs[
                'pretrained'] is not None:
            ckpt_dir = compress_configs['pretrained']
            io.load(
                ckpt_dir,
                model,
                optimizer=None,
                mode='quant',
                load_recovery=None)
            # Avoid loading again
            config.Global.save_load.ckpt_dir = None

    if prune_configs is not None and prune_configs.enable:
        prune_model(model, prune_configs, input_spec)
    # NOTE(minghaoBD): We haven't fully tested Prune+Quantization, so an "else if" is put here for separation.
    elif quant_configs is not None and quant_configs.enable:
        model, quanter = quant_model(model, quant_configs)

    return model, quanter
================================================ FILE: ppfleetx/__init__.py ================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
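All of the configs that follow (imagen, ernie, gpt) start from a _base_: reference that parse_config() in examples/transformer/utils/config.py resolves by recursively merging the child YAML over its base. A standalone sketch of that merge rule (not repository code; merge_over_base and the two dicts are made up for illustration):

# Standalone sketch of the _base_ merge rule implemented by parse_config()/_update_dic():
# child values override base values key by key, recursing into nested dicts, unless a
# sub-dict sets _inherited_: False, in which case it replaces the base sub-dict wholesale.
def merge_over_base(child, base):
    if child.get('_inherited_', True) is False:
        child = dict(child)
        child.pop('_inherited_')
        return child
    merged = dict(base)
    for key, val in child.items():
        if isinstance(val, dict) and key in merged and isinstance(merged[key], dict):
            merged[key] = merge_over_base(val, merged[key])
        else:
            merged[key] = val
    return merged

# Made-up example mirroring how imagen_397M_text2im_64x64.yaml overrides imagen_base.yaml:
base = {"Global": {"device": "gpu", "local_batch_size": 1}, "Model": {"name": "Imagen"}}
child = {"Global": {"local_batch_size": 4}, "Model": {"name": "imagen_397M_text2im_64"}}
print(merge_over_base(child, base))
# {'Global': {'device': 'gpu', 'local_batch_size': 4}, 'Model': {'name': 'imagen_397M_text2im_64'}}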
================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_397M_text2im_64 text_encoder_name: projects/imagen/t5/t5-11b text_embed_dim: 1024 timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: True auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 use_recompute: False recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 16 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 2500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 10 eval_freq: 10000000 eval_iters: 10000000 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 10000 output_dir: ./output ckpt_dir: Model: module: "ImagenModule" name: "Imagen" fused_linear: False # data loader for train Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 16 drop_last: True collate_fn: imagen_collate_fn Fused: tensor_fusion: False Optimizer: name: Adam weight_decay: 0. 
beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 2500000 warmup_rate: 0.025 max_lr: 1.0e-4 min_lr: 0.0 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Inference: model_dir: ./output mp_degree: 1 ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_SR1024 text_encoder_name: None text_embed_dim: timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_cond: True lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: False auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 is_sr: True use_recompute: True recompute_granularity: Engine: max_steps: 2500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 10 eval_freq: 10000000 eval_iters: 10000000 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] fp16_dtype: "bfloat16" save_load: save_steps: 10000 output_dir: ./output ckpt_dir: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 1024 text_max_len: 128 filter_image_resolution: 1024 sr: True loader: num_workers: 8 shuffle: True batch_size: 1 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_SR256 text_encoder_name: None # We do not use text conditoin during training. 
text_embed_dim: timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_cond: True lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: False auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 is_sr: True use_recompute: True recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 256 text_max_len: 128 filter_image_resolution: 256 sr: True loader: num_workers: 8 shuffle: True batch_size: 6 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_text2im_64_debertav2 text_encoder_name: projects/imagen/cache/deberta-v-xxlarge text_embed_dim: 1536 timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: True auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 use_recompute: False recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 8 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_text2im_64 text_encoder_name: projects/imagen/t5/t5-11b text_embed_dim: 1024 timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: True auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 use_recompute: True recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 8 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml 
================================================ _base_: ./finetune_ernie_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 40000 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 intermediate_size: hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 4 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: True use_recompute: False Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/finetune_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 16 micro_batch_size: 16 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: level: scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieSeqClsModuleAuto" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: collate_fn: name: DataCollatorWithPadding dataset: name: ErnieSeqClsDataset dataset_type: chnsenticorp_v2 tokenizer_type: ernie-1.0-base-zh-cw max_seq_len: 512 Eval: collate_fn: name: DataCollatorWithPadding dataset: name: ErnieSeqClsDataset dataset_type: chnsenticorp_v2 tokenizer_type: ernie-1.0-base-zh-cw max_seq_len: 512 Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 990000 warmup_rate: 0.01 max_lr: 0.0001 min_lr: 5e-05 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieModuleAuto" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: sample_split: 4 collate_fn: name: ErnieCollateData micro_batch_size: dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Train max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False 
favor_longer_ngram: False max_ngrams: 3 Eval: sample_split: 4 collate_fn: name: ErnieCollateData micro_batch_size: 1 dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Eval max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False favor_longer_ngram: False max_ngrams: 3 Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 990000 warmup_rate: 0.01 max_lr: 0.0001 min_lr: 0.00001 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base_345M_single_card.yaml ================================================ _base_: ./pretrain_ernie_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 40000 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 intermediate_size: hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 4 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: True use_recompute: False Data: Train: dataset: tokenizer_type: ernie-1.0-base-zh-cw Eval: dataset: tokenizer_type: ernie-1.0-base-zh-cw Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml ================================================ _base_: ./finetune_ernie_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 40000 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 intermediate_size: hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 4 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: True use_recompute: False Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 16 micro_batch_size: 16 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieSeqClsModule" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: dataset: name: ErnieSeqClsDataset dataset_type: chnsenticorp_v2 tokenizer_type: ernie-1.0-base-zh-cw max_seq_len: 512 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 0 return_list: False collate_fn: name: DataCollatorWithPadding Eval: 
    dataset:
      name: ErnieSeqClsDataset
      dataset_type: chnsenticorp_v2
      tokenizer_type: ernie-1.0-base-zh-cw
      max_seq_len: 512
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 0
      return_list: False
    collate_fn:
      name: DataCollatorWithPadding

Optimizer:
  name: FusedAdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 990000
    warmup_rate: 0.01
    max_lr: 5e-05
    min_lr: 1e-05
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False


================================================
FILE: ppfleetx/configs/nlp/ernie/inference_ernie_345M_single_card.yaml
================================================
_base_: ./finetune_ernie_345M_single_card.yaml

Inference:
  model_dir: ./output
  mp_degree: 1

Distributed:
  dp_degree:
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
    reduce_overlap: False
    broadcast_overlap: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml
================================================
Global:
  device: gpu
  seed: 1024
  binary_head: True
  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

Engine:
  max_steps: 500000
  num_train_epochs: 1
  accumulate_steps: 1
  logging_freq: 1
  eval_freq: 500000
  eval_iters: 10
  test_iters: -1
  mix_precision:
    enable: False
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
  save_load:
    save_steps: 50000
    save_epoch: 1
    output_dir: ./output
    ckpt_dir:

Model:
  module: "ErnieModule"
  name: "Ernie"
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  intermediate_size: 3072
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 2
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: False
  use_recompute: False

Data:
  Train:
    dataset:
      name: ErnieDataset
      input_dir: ./data
      tokenizer_type: ernie-1.0-base-zh-cw
      split: [949, 50, 1]
      mode: Train
      max_seq_length: 512
      masked_lm_prob: 0.15
      short_seq_prob: 0.1
      seed: 1024
      share_folder: False
      favor_longer_ngram: False
      max_ngrams: 3
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 0
      return_list: False
    collate_fn:
      name: ErnieCollateData
      micro_batch_size:

  Eval:
    dataset:
      name: ErnieDataset
      input_dir: ./data
      tokenizer_type: ernie-1.0-base-zh-cw
      split: [949, 50, 1]
      mode: Eval
      max_seq_length: 512
      masked_lm_prob: 0.15
      short_seq_prob: 0.1
      seed: 1024
      share_folder: False
      favor_longer_ngram: False
      max_ngrams: 3
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
    collate_fn:
      name: ErnieCollateData
      micro_batch_size: 1

Optimizer:
  name: FusedAdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 990000
    warmup_rate: 0.01
    max_lr: 0.0001
    min_lr: 0.00001
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False

Inference:
  model_dir: ./output
  mp_degree: 1


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 512
  micro_batch_size: 1

Model:
  vocab_size: 40000
  hidden_size: 12288
  num_hidden_layers: 96
  num_attention_heads: 96
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: True

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 16
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 40000
  hidden_size: 1024
  num_hidden_layers: 24
  num_attention_heads: 16
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: False

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 1

Model:
  vocab_size: 40000
  hidden_size: 768
  num_hidden_layers: 8
  num_attention_heads: 16
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: False

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 2
  mp_degree: 2
  pp_degree: 2
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_6.7B_sharding16.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 512
  micro_batch_size: 1

Model:
  vocab_size: 40000
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: True

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 16
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
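Every file in this group layers a small set of overrides on top of pretrain_ernie_base.yaml through the `_base_` key: only the Global batch sizes, the Model dimensions, and the Distributed degrees change between variants. A minimal sketch of how such an include-and-merge chain can be resolved is shown below; it assumes a simple recursive "child keys win" merge, and the helper names (`load_config`, `merge`) are illustrative rather than the actual PaddleFleetX loader API.

import os
import yaml

def merge(base, override):
    # Recursively overlay `override` onto `base`; scalar values in the child win.
    out = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = merge(out[key], value)
        else:
            out[key] = value
    return out

def load_config(path):
    # Load a YAML config, resolving its `_base_` parent first (if any).
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base_rel = cfg.pop("_base_", None)
    if base_rel is None:
        return cfg
    base = load_config(os.path.join(os.path.dirname(path), base_rel))
    return merge(base, cfg)

Resolved this way, a variant such as pretrain_ernie_base_3D.yaml keeps the base Optimizer and Data sections while replacing only the hidden size and the dp/mp/pp degrees it lists.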
hidden_act: "relu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 use_recompute: False Data: Train: dataset: tokenizer_type: ernie-1.0-large-zh-cw Eval: dataset: tokenizer_type: ernie-1.0-large-zh-cw Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieModule" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Train max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False favor_longer_ngram: False max_ngrams: 3 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 0 return_list: False collate_fn: name: ErnieCollateData micro_batch_size: Eval: dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Eval max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False favor_longer_ngram: False max_ngrams: 3 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: name: ErnieCollateData micro_batch_size: 1 Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 990000 warmup_rate: 0.01 max_lr: 0.0001 min_lr: 0.00001 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Inference: model_dir: ./output mp_degree: 1 Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: -1 num_train_epochs: -1 eval_freq: -1 eval_iters: -1 test_iters: -1 mix_precision: enable: True dtype: "float16" level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"] custom_white_list: ["lookup_table", "lookup_table_v2"] 
================================================
FILE: ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml
================================================
Global:
  device: gpu
  seed: 1024
  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

Engine:
  max_steps: -1
  num_train_epochs: -1
  eval_freq: -1
  eval_iters: -1
  test_iters: -1
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False
  save_load:
    output_dir:
    ckpt_dir:

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_175B_mp8.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 1
  top_p: 0.9
  temperature: 1.0
  min_dec_len: 1
  max_dec_len: 8
  use_topp_sampling: True
  num_return_sequences: 1
  decode_strategy: "sampling"
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 51200
  hidden_size: 12288
  num_layers: 96
  num_attention_heads: 96
  ffn_hidden_size: 49152
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 1
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_mp2.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 50
  top_p: 0.75
  temperature: 1.0
  min_dec_len: 1
  max_dec_len: 200
  num_return_sequences: 1
  decode_strategy: "sampling"
  use_topp_sampling: True
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 2
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_single_card.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 0
  top_p: 0.9
  use_topp_sampling: True
  inference: True
  temperature: 1.0
  min_dec_len: 8
  max_dec_len: 8
  num_return_sequences: 1
  decode_strategy: "sampling"
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 0
  top_p: 0.9
  use_topp_sampling: True
  inference: True
  temperature: 1.0
  min_dec_len: 8
  max_dec_len: 8
  num_return_sequences: 1
  decode_strategy: "sampling"
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 51200
  hidden_size: 4096
  num_layers: 32
  num_attention_heads: 32
  ffn_hidden_size: 16384
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 2048
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:

Distributed:
  dp_degree: 8
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8_tuning.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 2048
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity: "full_attn"
  no_recompute_layers:

Distributed:
  dp_degree: 8
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1

Tuning:
  enable: True
  tuning_recompute: True
  profile_start_step: 1
  profile_end_step: 5


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size: 8
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 2048
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_345M_single_card.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 4096
  num_layers: 32
  num_attention_heads: 32
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:

Distributed:
  dp_degree:
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 16
    sharding_stage: 2


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_base.yaml
================================================
Global:
  device: gpu
  seed: 1024
  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

Engine:
  max_steps: 500000
  num_train_epochs: 1
  eval_freq: 1
  eval_iters: 10
  test_iters:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False
  save_load:
    output_dir: ./output
    ckpt_dir:

Model:
  module: "GPTModuleAuto"
  name: "GPT"
  fuse_attn_qkv: False

Data:
  Train:
    collate_fn: gpt_collate_fn
    sample_split: 2
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024

  Eval:
    collate_fn: gpt_collate_fn
    sample_split: 2
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024

Optimizer:
  name: AdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 360000
    warmup_rate: 0.01
    max_lr: 5.0e-5
    min_lr: 1.0e-5
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/qat_generation_gpt_345M_mp2.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 50
  top_p: 0.75
  temperature: 1.0
  min_dec_len: 1
  max_dec_len: 200
  num_return_sequences: 1
  decode_strategy: "sampling"

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 2
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1

Quantization:
  enable: True
  channel_wise_abs_max: False
  weight_bits: 8
  activation_bits: 8
  onnx_format: True


================================================
FILE: ppfleetx/configs/nlp/gpt/eval_gpt_345M_single_card.yaml
================================================
_base_: ./pretrain_gpt_345M_single_card.yaml

Model:
  module: GPTEvalModule

Offline_Eval:
  eval_path: ./wikitext-103/wiki.valid.tokens
  cloze_eval: False
  overlapping_eval: 32
  batch_size: 8
  max_seq_len: 1024
  logging_freq: 10


================================================
FILE: ppfleetx/configs/nlp/gpt/eval_pruned_gpt_345M_single_card.yaml
================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Engine: save_load: ckpt_dir: Model: module: GPTEvalModule hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Offline_Eval: eval_path: ./lambada_test.jsonl cloze_eval: True overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: ppfleetx/configs/nlp/gpt/eval_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTEvalModule Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: ppfleetx/configs/nlp/gpt/export_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml ================================================ _base_: ./finetune_gpt_base.yaml Global: global_batch_size: local_batch_size: 32 micro_batch_size: 32 Engine: run_mode: epoch num_train_epochs: 3 accumulate_steps: logging_freq: 10 eval_freq: 1 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "reduce_mean"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "GPTFinetuneModule" name: "GPT" num_classes: 2 pretrained: './ckpt/PaddleFleetX_GPT_345M_220826/model' fuse_attn_qkv: True fused_linear: False vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: loss: train: name: 'CrossEntropyLoss' eval: name: 'CrossEntropyLoss' metric: eval: name: 'Accuracy' Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False 
broadcast_overlap: False Optimizer: name: FusedAdamW weight_decay: 0.0 beta1: 0.9 beta2: 0.999 epsilon: 1e-6 multi_precision: True lr: name: LinearDecayWithWarmup warmup: 0.1 learning_rate: 2e-5 tensor_fusion: False Data: Train: dataset: name: SST2 root: ./dataset/SST-2/ split: 'train' max_length: 128 sampler: name: DistributedBatchSampler batch_size: 32 shuffle: True drop_last: True loader: num_workers: 4 return_list: False Eval: dataset: name: SST2 root: ./dataset/SST-2/ split: 'dev' max_length: 128 sampler: name: DistributedBatchSampler batch_size: 32 shuffle: False drop_last: False loader: num_workers: 4 return_list: False ================================================ FILE: ppfleetx/configs/nlp/gpt/finetune_gpt_base.yaml ================================================ Global: device: gpu seed: 42 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: run_mode: epoch max_steps: -1 eval_freq: 1 eval_iters: -1 test_iters: -1 save_load: save_steps: -1 save_epoch: 1 output_dir: ./output ckpt_dir: Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Model: use_flash_attn: False ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_345M_dp8.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_345M_mp1.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Engine: mix_precision: level: Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" Model: module: GPTGenerationModuleAuto vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False fuse_attn_qkv: True Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_6.7B_single_mp1.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Engine: mix_precision: level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"] custom_white_list: ["lookup_table", "lookup_table_v2"] use_fp16_guard: False Generation: top_k: 0 top_p: 0.9 use_topp_sampling: True inference: True temperature: 1.0 min_dec_len: 8 max_dec_len: 8 num_return_sequences: 1 decode_strategy: "sampling" Model: module: GPTGenerationModuleAuto vocab_size: 51200 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: 
16384 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False fuse_attn_qkv: True Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_pruned_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_qat_gpt_6.7B_single_card.yaml ================================================ _base_: ./pretrain_gpt_6.7B_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/inference_gpt_345M_dp8.yaml ================================================ _base_: ./generation_gpt_345M_dp8.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Data: Test: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ================================================ FILE: ppfleetx/configs/nlp/gpt/inference_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_345M_single_card.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Data: Test: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml ================================================ _base_: 
./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 8 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_13B_dp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: seed: 1234 global_batch_size: 480 local_batch_size: micro_batch_size: 4 Engine: max_steps: 200000 eval_freq: 1000 eval_iters: 10 save_load: save_steps: 500 Model: vocab_size: 50432 hidden_size: 5120 num_layers: 40 num_attention_heads: 40 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 4096 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: 'full' no_recompute_layers: Data: Train: dataset: max_seq_len: 4096 Eval: dataset: max_seq_len: 4096 Distributed: dp_degree: mp_degree: 2 pp_degree: 8 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Optimizer: lr: name: CosineAnnealingWithWarmupDecay decay_steps: 160000 warmup_rate: 0.001 max_lr: 1.0e-4 min_lr: 1.0e-5 ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 1536 micro_batch_size: 1 Model: vocab_size: 51200 hidden_size: 12288 num_layers: 96 num_attention_heads: 96 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: 'core_attn' no_recompute_layers: virtual_pp_degree: 1 sequence_parallel: True fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 16 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 
max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Engine: logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: 16384 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "GPTModule" name: "GPT" vocab_size_divisible_unit: 128 fused_linear: False fuse_attn_qkv: True scale_qk_by_layer_num: True sequence_parallel: False use_flash_attn: False fused_softmax_with_triangular: True Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Eval: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 use_increments: True grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False 
Distributed: fuse_sequence_parallel_allreduce: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_cn_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: name: "GPT-cn" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/prune_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Engine: save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False comm_overlap: False Optimizer: weight_decay: 0.0 lr: decay_steps: 90000 warmup_rate: 0.00 max_lr: 2.5e-5 min_lr: 5.0e-6 Compress: pretrained: Prune: enable: True criterion: l1_norm ratio: 0.125 ================================================ FILE: ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 1 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: 
sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: ppfleetx/configs/nlp/gpt/qat_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Engine: logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/moe/pretrain_moe_1.3B_dp8.yaml ================================================ _base_: ./pretrain_moe_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 8 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: balance_loss_weight: 1.0 Model: module: "MoEModule" name: "MoE" fused_linear: False fuse_attn_qkv: True sequence_parallel: False moe_configs: expert_mode: True gate: gshard top_k: 2 num_experts: 2 Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Eval: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 
sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradForMOEByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 hcg: HybridCommGroupForMoE ================================================ FILE: ppfleetx/configs/vis/base.yaml ================================================ Global: device: gpu seed: 2021 global_batch_size: local_batch_size: 1 micro_batch_size: 1 flags: FLAGS_enable_cublas_tensor_op_math: True FLAGS_gemm_use_half_precision_compute_type: False Engine: run_mode: epoch max_steps: -1 eval_freq: 1 eval_iters: -1 test_iters: -1 save_load: save_steps: -1 save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Model: use_recompute: False Fused: tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Inference: model_dir: ./output mp_degree: 1 ================================================ FILE: ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2022 Engine: run_mode: 'epoch' num_train_epochs: 100 eval_freq: 1 eval_iters: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "MOCOClsModule" model: base_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier #pretrained: ./pretrained/mocov1/model pretrained: ./pretrained/mocov2/model base_classifier: name: "MoCoClassifier" with_pool: True num_features: 2048 num_classes: 1000 loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 lr: name: MultiStepDecay run_mode: epoch learning_rate: 30.0 gamma: 0.1 milestones: [60, 80] Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total bachsize 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: resize_short: 256 interpolation: bilinear backend: pil - CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 
drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2022 Engine: run_mode: 'epoch' num_train_epochs: 200 eval_freq: -1 eval_iters: 0 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "MOCOModule" model: base_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier base_classifier: name: "MoCoClassifier" with_pool: True num_features: 2048 num_classes: 128 momentum_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier momentum_classifier: name: "MoCoClassifier" with_pool: True num_features: 2048 num_classes: 128 loss: train: name: 'CELoss' Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0001 lr: name: MultiStepDecay run_mode: epoch learning_rate: 0.03 gamma: 0.1 milestones: [120, 160] Data: Train: dataset: name: ContrativeLearningDataset root: ./dataset/ILSVRC2012/train transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 scale: [0.2, 1.0] interpolation: bicubic backend: pil - RandomGrayscale: p: 0.2 - ColorJitter: brightness: 0.4 contrast: 0.4 saturation: 0.4 hue: 0.4 - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2022 Engine: run_mode: 'epoch' num_train_epochs: 200 eval_freq: -1 eval_iters: 0 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "MOCOModule" model: T: 0.2 base_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier base_projector: name: "MoCoV2Projector" in_dim: 2048 out_dim: 2048 with_pool: True base_classifier: name: "MoCoClassifier" with_pool: False num_features: 2048 num_classes: 128 momentum_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier momentum_projector: name: "MoCoV2Projector" in_dim: 2048 out_dim: 2048 with_pool: True momentum_classifier: name: "MoCoClassifier" with_pool: False num_features: 2048 num_classes: 128 loss: train: name: 'CELoss' Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0001 lr: name: CosineDecay run_mode: epoch update_unit: epoch learning_rate: 0.03 Data: Train: dataset: name: ContrativeLearningDataset root: ./dataset/ILSVRC2012/train transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 scale: [0.2, 1.0] interpolation: bicubic backend: pil - ColorJitter: brightness: 0.4 contrast: 0.4 saturation: 0.4 hue: 0.1 p: 0.8 - RandomGrayscale: p: 0.2 - GaussianBlur: sigma: [.1, 2.] 
p: 0.5 - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml ================================================ Global: device: gpu seed: 2021 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Engine: run_mode: 'epoch' num_train_epochs: 300 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: ./ckpt Model: use_recompute: False module: "GeneralClsModule" model: name: "ViT_base_patch16_224" class_num: 1000 drop_rate: 0.1 loss: train: name: 'ViTCELoss' epsilon: 0.0001 eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Inference: model_dir: ./output mp_degree: 1 TensorRT: max_batch_size: 1 workspace_size: 1<<30 min_subgraph_size: 3 precision: fp16 use_static: False use_calib_mode: False collect_shape: False shape_range_info_filename: ./shape.pbtxt ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 300 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_224" class_num: 1000 drop_rate: 0.1 loss: train: name: 'ViTCELoss' epsilon: 0.0001 eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 scale: [0.05, 1.0] interpolation: bicubic backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: resize_short: 256 interpolation: bicubic backend: pil - 
CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.004 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 0.35 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 103 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_384" class_num: 10 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.004 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 0.35 Data: Train: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: train transform_ops: - RandCropImage: size: 384 scale: 
[0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: test transform_ops: - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True Compress: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 onnx_format: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-384 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.004 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 0.35 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True Compress: Quantization: enable: True weight_quantize_type: 'channel_wise_abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 onnx_format: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 
accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_large_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet21k-ViT-L_16 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.03 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_large_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet21k-ViT-L_16 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.03 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt 
transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True Compress: Quantization: enable: True weight_quantize_type: 'channel_wise_abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 onnx_format: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 1 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_tiny_patch16_224" class_num: 10 drop_rate: 0.1 loss: train: name: 'ViTCELoss' epsilon: 0.0001 eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: train transform_ops: - RandCropImage: size: 224 scale: [0.05, 1.0] interpolation: bicubic backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: test transform_ops: - ResizeImage: resize_short: 256 interpolation: bicubic backend: pil - CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/auto/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml ================================================ _base_: ./base.yaml Global: device: gpu seed: 2021 local_batch_size: 256 micro_batch_size: 256 Engine: num_train_epochs: 1 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModuleAuto" model: name: "ViT_tiny_patch16_224" class_num: 10 drop_rate: 0.1 loss: name: 'ViTCELoss' metric: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: sample_split: 1 dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: train transform_ops: - RandCropImage: size: 224 scale: [0.05, 
1.0] interpolation: bicubic backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: Eval: sample_split: 1 dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: test transform_ops: - ResizeImage: resize_short: 256 interpolation: bicubic backend: pil - CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: ================================================ FILE: ppfleetx/configs/vis/vit/auto/base.yaml ================================================ Global: device: gpu seed: 2021 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: run_mode: epoch max_steps: -1 eval_freq: 1 eval_iters: -1 test_iters: -1 save_load: save_steps: -1 save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 Model: use_recompute: False ================================================ FILE: ppfleetx/core/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .engine import * from .module import * ================================================ FILE: ppfleetx/core/engine/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .basic_engine import BasicEngine from .inference_engine import InferenceEngine, TensorRTConfig from .eager_engine import EagerEngine from .auto_engine import AutoEngine ================================================ FILE: ppfleetx/core/engine/auto_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
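# ---------------------------------------------------------------------------
# Illustrative usage sketch (comments only). It assumes `configs` has been
# parsed from one of the yaml files above and that `module`, `train_dataset`
# and `valid_dataset` have been built elsewhere (e.g. via GeneralClsModuleAuto
# and build_auto_dataset); the calls mirror the AutoEngine API defined below:
#
#     engine = AutoEngine(configs, module=module, mode='train')
#     engine.load()       # optional, only if Engine.save_load.ckpt_dir is set
#     engine.fit(train_dataset=train_dataset, valid_dataset=valid_dataset)
#     engine.save()       # writes <output_dir>/auto
# ---------------------------------------------------------------------------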
import os import time import sys import logging import numpy as np import paddle import paddle.nn as nn import paddle.distributed as dist import paddle.fluid.core as core from paddle.distributed.fleet import auto from paddle.optimizer.lr import LRScheduler from ppfleetx.utils.log import logger from ppfleetx.core.engine import BasicEngine from ppfleetx.core.module import BasicModule from ppfleetx.utils.version import version_check from ppfleetx.data import utils from ppfleetx.optims import build_lr_scheduler, build_optimizer logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class AutoEngine(BasicEngine): def __init__(self, configs, module=None, mode='train'): super().__init__() version_check() model = None loss_fn = None if module and not isinstance(module, BasicModule): raise TypeError( "'module' must be sub classes of `BasicModule`, but got: {model.__class__.__name__}." ) if module: if module.model and not isinstance( module.model, nn.Layer) and not callable(module.model): raise TypeError( "'model' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.model.__class__.__name__}." ) model = module.model if mode == 'train': if module.loss_fn and not isinstance( module.loss_fn, nn.Layer) and not callable(module.loss_fn): raise TypeError( "'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}." ) else: module.loss_fn = None module.model.eval() loss_fn = module.loss_fn self._module = module # lr_scheduler and optimizer lr = build_lr_scheduler( configs.Optimizer.lr) if mode == "train" else None optimizer = build_optimizer(configs.Optimizer, model, lr) if mode == "train" else None # engine configs self._configs = configs['Engine'] self._max_steps = self._configs['max_steps'] self._verbose = self._configs["verbose"] self._eval_freq = self._configs['eval_freq'] self._eval_iters = self._configs['eval_iters'] self._test_iters = self._configs['test_iters'] self._logging_freq = self._configs['logging_freq'] self._num_train_epochs = self._configs['num_train_epochs'] self._strategy = self._configs['strategy'] # save & load self._save_steps = self._configs['save_load']['save_steps'] self._save_epoch = self._configs['save_load']['save_epoch'] self._output_dir = self._configs['save_load']['output_dir'] self._ckpt_dir = self._configs['save_load']['ckpt_dir'] # engine fit inputs self.batch_size = configs['Global']['global_batch_size'] # init engine self._auto_engine = auto.Engine( model, loss_fn, optimizer, strategy=self._strategy) def fit(self, epoch=1, train_dataset=None, valid_dataset=None): train_sample_split = train_dataset.sample_split if train_dataset else None valid_sample_split = valid_dataset.sample_split if valid_dataset else None self._auto_engine.fit(train_data=train_dataset, valid_data=valid_dataset, train_sample_split=train_sample_split, valid_sample_split=valid_sample_split, epochs=self._num_train_epochs, batch_size=self.batch_size, steps_per_epoch=self._max_steps, valid_steps=self._eval_iters, valid_freq=self._eval_freq, collate_fn=train_dataset.collate_fn, log_freq=self._logging_freq, save_dir=self._output_dir, save_freq=self._save_steps, verbose=self._verbose) def evaluate(self, valid_dataset=None): self._auto_engine.evaluate( valid_data=valid_dataset, valid_sample_split=valid_dataset.sample_split, batch_size=self.batch_size, steps=self._max_steps, collate_fn=valid_dataset.collate_fn) def predict(self, test_dataset=None): self._auto_engine.predict( 
test_data=test_dataset, test_sample_split=test_dataset.sample_split, batch_size=self.batch_size, steps=self._max_steps, collate_fn=test_dataset.collate_fn) def export(self): self._auto_engine.prepare(self._module.input_spec(), mode="predict") self.save(training=False) def tune(self, tune_dataset=None): self._auto_engine._tune( tune_dataset, tune_sample_split=tune_dataset.sample_split, batch_size=self.batch_size) def save(self, training=True): if self._output_dir and isinstance(self._output_dir, str): path = os.path.join(self._output_dir, "auto") self._auto_engine.save(path, training=training) else: raise TypeError("`save` requires a valid value of `output_dir`.") def load(self): if self._ckpt_dir and isinstance(self._ckpt_dir, str): self._auto_engine.load(self._ckpt_dir) else: logger.warning("`load` requires a valid value of `ckpt_dir`.") def export_from_prog(self): paddle.enable_static() if not (self._ckpt_dir and isinstance(self._ckpt_dir, str)): raise ValueError("invalid ckpt_dir.") exe = paddle.static.Executor() [inference_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model( path_prefix=self._ckpt_dir, executor=exe) feed_targets = [ inference_program.global_block().var(name) for name in feed_target_names ] self._auto_engine.prepare( inputs=feed_targets, main_program=inference_program, startup_program=paddle.static.Program(), mode="predict") model_dict = self._auto_engine.main_program.state_dict() for param in list( filter(lambda var: var.persistable, self._auto_engine.main_program.list_vars())): if param.type in [ core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST ]: continue if param.dtype != model_dict[param.name]._dtype(): model_dict[param.name] = model_dict[param.name]._as_type( param.dtype) self._auto_engine.main_program.set_state_dict(model_dict) path = os.path.join(self._output_dir, "auto_dist0") paddle.static.save_inference_model( path, feed_targets, fetch_targets, exe, program=self._auto_engine.main_program, ) paddle.disable_static() ================================================ FILE: ppfleetx/core/engine/basic_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. class BasicEngine: """ """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def fit(self, *args, **kwargs): raise NotImplementedError def evaluate(self, *args, **kwargs): raise NotImplementedError def predict(self, *args, **kwargs): raise NotImplementedError def save(self, *args, **kwargs): raise NotImplementedError def load(self, *args, **kwargs): raise NotImplementedError def inference(self, *args, **kwargs): raise NotImplementedError ================================================ FILE: ppfleetx/core/engine/eager_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time import sys import logging from tokenize import group import paddle import paddle.nn as nn import paddle.distributed as dist import paddle.distributed.fleet as fleet from paddle.optimizer.lr import LRScheduler from paddle.distributed.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.profiler import SummaryView from paddle.distributed.fleet.meta_parallel import TensorParallel from paddle.distributed.sharding import group_sharded_parallel import paddleslim from ppfleetx.distributed.apis import env, amp from ppfleetx.optims import build_lr_scheduler, build_optimizer from ppfleetx.utils.log import logger, get_timestamp, convert_timestamp_to_data from ppfleetx.core.engine import BasicEngine, InferenceEngine, TensorRTConfig from ppfleetx.core.module import BasicModule from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters from ppfleetx.utils.version import version_check from ppfleetx.utils.export import export_inference_model from paddle.incubate.distributed.utils.io import save_for_auto_inference from ppfleetx.utils.device import synchronize as device_synchronize from ppfleetx.utils.compression_helper import prune_model, quant_model class EagerEngine(BasicEngine): """ The common engine for all models that support single-card and distributed training, validation and test. Only used in eager dygraph mode. """ def __init__(self, configs, module, optimizer=None, lr=None, mode='train'): """ Initialize an engine depending on the user-defined module and configs. Args: module(BasicModule): user-defined module. After assigning computations and configurations of model/optimizers/lr Schedulers, engine can support the whole loop of training/validation/test. configs(dict): the configurations that engine needs for training/validation/test loop. Such as mix precision strategy, save&load and the infos of steps/epoches. Return: An instance of `EagerEngine`. Examples:: class TestModule(BasicModule): def __init__(self): super().__init__() self.model = paddle.nn.Linear(28 * 28, 10) self.loss_fn = paddle.nn.MSELoss() def forward(self, x): return paddle.relu(self.model(x.reshape(-1))) def training_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def configure_optimizers(self): return paddle.optimizer.Adam( parameters=self.model.parameters(), learning_rate=0.02) module = TestModule() engine = EagerEngine(module, configs) """ super().__init__() version_check() self.mode = mode if not isinstance(module, BasicModule): raise TypeError( "'module' must be sub classes of `BasicModule`, but got: {model.__class__.__name__}." ) self._module = module if module.model and not isinstance( module.model, nn.Layer) and not callable(module.model): raise TypeError( "'model' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.model.__class__.__name__}." 
) # if mode == 'train': # if module.loss_fn and not isinstance( # module.loss_fn, nn.Layer) and not callable(module.loss_fn): # raise TypeError( # "'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}." # ) # global configs self._global_batch_size = configs['Global']['global_batch_size'] # engine configs self._configs = configs['Engine'] self._run_mode = self._configs.get('run_mode', 'step') assert self._run_mode in ['epoch', 'step' ], 'run_mode must be epoch or step' self._max_steps = self._configs['max_steps'] self._eval_freq = self._configs['eval_freq'] self._eval_iters = self._configs['eval_iters'] self._test_iters = self._configs['test_iters'] self._logging_freq = self._configs['logging_freq'] self._num_train_epochs = self._configs['num_train_epochs'] self._accumulate_steps = self._configs['accumulate_steps'] amp_config = self._configs['mix_precision'] self._amp_enable = amp_config['enable'] if mode == 'export' and self._amp_enable: logger.info("NOTE: disable mix_precision in export mode") self._amp_enable = False self._amp_dtype = amp_config.get('dtype', 'float16') self._amp_level = amp_config.get('level', 'O2') self._use_main_grad = amp_config.get('use_main_grad', False) self._scale_loss = amp_config['scale_loss'] self._custom_black_list = amp_config['custom_black_list'] self._custom_white_list = amp_config['custom_white_list'] self._save_steps = self._configs['save_load']['save_steps'] self._save_epoch = self._configs['save_load']['save_epoch'] self._output_dir = self._configs['save_load']['output_dir'] self._ckpt_dir = self._configs['save_load']['ckpt_dir'] self._compress_configs = None self.prune_configs = None self.quant_configs = None self._quant_mode = False if 'Compress' in configs: self.mode = 'compress' self._compress_configs = configs['Compress'] if "Prune" in self._compress_configs: self.prune_configs = self._compress_configs["Prune"] if "Quantization" in self._compress_configs: self.quant_configs = self._compress_configs["Quantization"] self._quant_mode = True self.compress_model() # TODO(haohongxiang): Remove there extra configs after reconstruct of Fleet API self._dist_configs = configs['Distributed'] self._dp_degree = self._dist_configs['dp_degree'] self._mp_degree = self._dist_configs['mp_degree'] self._pp_degree = self._dist_configs['pp_degree'] sharding_config = self._dist_configs['sharding'] self._sharding_stage = sharding_config['sharding_stage'] self._sharding_degree = sharding_config['sharding_degree'] self._sharding_offload = sharding_config['sharding_offload'] self._reduce_overlap = sharding_config['reduce_overlap'] self._broadcast_overlap = sharding_config['broadcast_overlap'] self._use_recompute = configs['Model']['use_recompute'] if self._amp_enable: if mode == 'train' and self._amp_dtype == "float16": self._scaler = paddle.amp.GradScaler( init_loss_scaling=self._scale_loss) else: # bfloat16 self._scaler = paddle.amp.GradScaler( init_loss_scaling=1, use_dynamic_loss_scaling=False) # Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
if self._amp_level == "O2": self._module.model = paddle.amp.decorate( models=self._module.model, dtype=self._amp_dtype, level=self._amp_level) else: self._scaler = None if mode == 'train': self._use_increments = configs.Optimizer.lr.pop('use_increments', False) self._lr_scheduler_mode = configs.Optimizer.lr.pop('run_mode', 'step') assert self._lr_scheduler_mode in [ 'epoch', 'step' ], 'lr.run_mode must be epoch or step' self._lr_scheduler = build_lr_scheduler( configs.Optimizer.lr) if mode == 'train' else None self._optimizer = build_optimizer( configs.Optimizer, self._module.model, self._lr_scheduler) if mode == 'train' else None if self._amp_enable and self._amp_dtype in [ 'float16', 'bfloat16' ] and self._amp_level == 'O2' and self._use_main_grad: self._module.model = amp.MixPrecisionLayer( self._module.model, dtype=self._amp_dtype) self._optimizer = amp.MixPrecisionOptimizer(self._optimizer) self._scaler = amp.MixPrecisionScaler(self._scaler) # distributed configs self._distributed = (dist.get_world_size() > 1) if self._distributed: self._hcg = env.get_hcg() self._dp_group = self._hcg.get_data_parallel_group() self._sharding_group = self._hcg.get_sharding_parallel_group() self._dp_rank = self._hcg.get_data_parallel_rank() self._mp_rank = self._hcg.get_model_parallel_rank() self._pp_rank = self._hcg.get_stage_id() self._sharding_rank = self._hcg.get_sharding_parallel_rank() self._wrap_with_fleet() else: self._dp_rank = 0 # using for save/load self._load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if 'Inference' in configs: self._inference_configs = configs['Inference'] self._inference_engine = None self.profiler = None if 'Profiler' in configs and configs.get('Profiler', {}).get('enable', False): self.profiler_config = configs['Profiler'] scheduler = self.profiler_config.get('scheduler', None) profiler_log = self.profiler_config.get('profiler_log', './profiler_log') record_shapes = self.profiler_config.get('record_shapes', True) profile_memory = self.profiler_config.get('profile_memory', True) self.profiler = paddle.profiler.Profiler( targets=[ paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU ], scheduler=scheduler, on_trace_ready=paddle.profiler.export_chrome_tracing( profiler_log), record_shapes=record_shapes, profile_memory=profile_memory) self.profiler.start() logger.warning( "Profiler is enabled, do not enable it in production.") def _wrap_with_fleet(self): if self._sharding_stage in [2, 3]: assert self._pp_degree == 1, "sharding stage2/3 will support pipeline parallel later" self._wrap_sharding_2_3() else: self._wrap_3D_parallel() def _wrap_sharding_2_3(self): if self._dp_degree > 1 and self._sharding_stage == 3: sync_params_buffers( self._module.model, comm_group=self._dp_group, src_rank=self._dp_group.ranks[0]) if self._mp_degree > 1: assert self._sharding_stage == 2, "only support mp + sharding stage2 hybrid parallel now." 
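# For mp_degree > 1 the layers are first wrapped with TensorParallel, then
# group_sharded_parallel shards optimizer states and gradients ("os_g",
# stage 2) or parameters, gradients and optimizer states ("p_g_os", stage 3)
# across the sharding group; reduce/broadcast overlap is enabled afterwards
# when configured.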
self._module.model = TensorParallel( self._module.model, self._hcg, strategy=None) level = "p_g_os" if self._sharding_stage == 3 else "os_g" origin_model = self._module.model self._module.model, self._optimizer, self._scaler = group_sharded_parallel( model=self._module.model, optimizer=self._optimizer, level=level, scaler=self._scaler, group=self._sharding_group, offload=self._sharding_offload, dp_group=self._dp_group if self._dp_group.nranks > 1 else None) if self._reduce_overlap: self._module.model._set_reduce_overlap(self._reduce_overlap) if self._broadcast_overlap: self._optimizer._set_broadcast_overlap( self._broadcast_overlap, layers=origin_model, num_groups=2) def _wrap_3D_parallel(self): if isinstance(self._module.model, amp.MixPrecisionLayer): if dist.get_world_size() == self._dp_degree: sync_params_buffers( self._module.model, comm_group=self._dp_group, src_rank=self._dp_group.ranks[0]) elif self._pp_degree > 1: self._module.model = fleet.distributed_model( self._module.model._layers) else: self._module.model = fleet.distributed_model(self._module.model) self._optimizer = fleet.distributed_optimizer(self._optimizer) self._scaler = fleet.distributed_scaler( self._scaler) if self._scaler is not None else self._scaler def _train_one_epoch(self, epoch_index, train_data_loader=None, valid_data_loader=None): self._module.model.train() # time count train_losses = [] train_step_start = get_timestamp() skip_first = True # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. total_train_batch = self._max_steps if self._run_mode == 'step' else len( train_data_loader) total_train_step = self._max_steps if self._run_mode == 'step' else total_train_batch * self._num_train_epochs total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 valid_data_loader = valid_data_loader( ) if valid_data_loader is not None else None eval_finished_step = 0 for step, batch in enumerate(train_data_loader()): if epoch_index == self._load_recovery['epoch']: if step < self._load_recovery['step']: continue loss = self._fit_impl(batch) train_losses.append(loss) if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step': if self._scaler is None or self._scaler._found_inf == 0: self._lr_scheduler.step(epoch=self._global_batch_size if self._use_increments else None) if (step + 1) % self._logging_freq == 0: train_step_cost = get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] log_dict = { 'epoch': epoch_index, 'total_epoch': self._num_train_epochs, 'batch': step, 'total_batch': total_train_batch, 'total_step': total_train_step, 'train_cost': train_step_cost if step == 0 else train_step_cost / self._logging_freq, 'loss': sum(numpy_losses) / len(numpy_losses), 'lr': self._optimizer.get_lr(), 'found_inf': self._scaler._found_inf if self._scaler is not None else 0, } if self._amp_enable: log_dict['loss_scale'] = self._scaler._scale.numpy()[0] self._module.training_step_end(log_dict) train_step_start = get_timestamp() train_losses = [] self._optimizer.clear_grad() if self._run_mode == 'step' and not skip_first: if self._eval_freq > 0 and step % self._eval_freq == 0: eval_losses = [] eval_step_start = get_timestamp() for eval_step, batch in enumerate(valid_data_loader): eval_finished_step += 1 loss = self._evaluate_impl(batch) eval_losses.append(loss) if eval_step >= self._eval_iters - 1: break eval_step_cost = get_timestamp() - eval_step_start eval_loss = sum(eval_losses) / len(eval_losses) log_dict = { 'loss': 
float(eval_loss), 'epoch': epoch_index, 'batch': eval_finished_step, 'total_batch': total_eval_batch, 'eval_cost': eval_step_cost / self._logging_freq, } self._module.validation_step_end(log_dict) if self._save_steps > 0 and step % self._save_steps == 0: device_synchronize() self.save(epoch=epoch_index, step=step) else: skip_first = False if self._run_mode == 'step' and step >= self._max_steps: return if self.profiler: self.profiler.step() def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None): """ Run the full process of training/validation/save loop. Args: epoch(int): the epoch index. train_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying training samples. valid_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying validation samples. """ self._module.model.train() train_start = get_timestamp() start_epoch = self._load_recovery['epoch'] if self._load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(self._load_recovery['rng_state']) for epoch_index in range(start_epoch, epoch): train_epoch_start = get_timestamp() self._train_one_epoch(epoch_index, train_data_loader, valid_data_loader) train_epoch_cost = get_timestamp() - train_epoch_start log_dict = { 'epoch': epoch_index, 'train_cost': train_epoch_cost, } self._module.training_epoch_end(log_dict) if self._lr_scheduler is not None and self._lr_scheduler_mode == 'epoch': self._lr_scheduler.step() if self._run_mode == 'epoch' and self._eval_freq > 0 and \ epoch_index % self._eval_freq == 0: eval_epoch_start = get_timestamp() self._evaluate_one_epoch(epoch_index, valid_data_loader) eval_epoch_cost = get_timestamp() - eval_epoch_start log_dict = { 'epoch': epoch_index, 'eval_cost': eval_epoch_cost, } self._module.validation_epoch_end(log_dict) if self._save_epoch > 0 and self._run_mode == 'epoch' and epoch_index % self._save_epoch == 0: self.save(epoch=epoch_index, step=len(train_data_loader)) logger.info( "The training process is complete and total cost of time for training is : {}". 
format(convert_timestamp_to_data(get_timestamp() - train_start))) if self.profiler: self._profiler_done() def _fit_impl(self, batch): self._module.model.train() batch = self._module.pretreating_batch(batch) if self._pp_degree == 1: if self._use_recompute and isinstance(self._module.model, paddle.DataParallel): with self._module.model.no_sync(): loss = self._model_forward_backward(batch) if not hasattr(self._optimizer, "all_fused_tensors" ) or self._optimizer.all_fused_tensors is None: try: fused_allreduce_gradients( list(self._module.model.parameters()), None) except: m = self._module.model.state_dict() fused_allreduce_gradients( list(self._module.model.parameters()), None) else: all_reduce_parameters(self._optimizer.all_fused_tensors, self._dp_group) elif isinstance(self._module.model, amp.MixPrecisionLayer) \ and self._distributed and dist.get_world_size() == self._dp_degree: loss = self._model_forward_backward(batch) fused_allreduce_gradients( list(self._module.model.parameters()), None) else: loss = self._model_forward_backward(batch) else: with paddle.amp.auto_cast( enable=self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): batch = self._module.model._prepare_training( batch, self._optimizer, self._lr_scheduler) loss = self._module.model.forward_backward_pipeline( batch, self._scaler) self._optim_update_params() return loss def _model_forward_backward(self, batch): if self._accumulate_steps == 1 or self._pp_degree > 1: batches = [batch] else: split_batches = [ paddle.split(b, self._accumulate_steps) for b in batch ] batches = [] for i in range(len(split_batches[0])): micro_batch = [split_batch[i] for split_batch in split_batches] batches.append(micro_batch) final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): loss = self._module.training_step(micro_batch) if self._amp_enable and self._amp_dtype == "float16": loss_bw = self._scaler.scale(loss) else: loss_bw = loss if self._accumulate_steps > 1: # div the loss for backward loss_bw = loss_bw / self._accumulate_steps self._module.backward(loss_bw) detach_loss = loss.detach() if final_loss is None: final_loss = detach_loss else: final_loss = paddle.add(final_loss, detach_loss) if self._accumulate_steps > 1: # div the loss for print final_loss = final_loss / self._accumulate_steps return final_loss def _optim_update_params(self): if self._sharding_stage in [3] and self._dp_degree > 1: fused_allreduce_gradients(self._module.model.parameters(), self._hcg) for p in self._module.model.parameters(): if hasattr(p, "bw_storage"): assert p.grad is None, "This case shouldn't happen." p.bw_storage.scale_(1.0 / self._dp_group.nranks) dist.all_reduce(p.bw_storage, group=self._dp_group) if self._amp_enable and self._amp_dtype == "float16": self._scaler.step(self._optimizer) self._scaler.update() else: self._optimizer.step() @paddle.no_grad() def evaluate(self, epoch=1, valid_data_loader=None): """ run one evaluation epoch over the validation set. Args: epoch(int): the epoch index. valid_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying validation samples. 
""" self._module.model.eval() for epoch_index in range(epoch): eval_epoch_start = get_timestamp() self._evaluate_one_epoch(epoch_index, valid_data_loader) eval_epoch_cost = get_timestamp() - eval_epoch_start log_dict = { 'epoch': epoch_index, 'eval_cost': eval_epoch_cost, } self._module.validation_epoch_end(log_dict) logger.info("The evaluting process is complete.") del valid_data_loader return @paddle.no_grad() def _evaluate_one_epoch(self, epoch=1, valid_data_loader=None): self._module.model.eval() eval_step_start = get_timestamp() eval_losses = [] total_eval_batch = len(valid_data_loader) valid_data_loader = valid_data_loader( ) if valid_data_loader is not None else None for eval_step, batch in enumerate(valid_data_loader): loss = self._evaluate_impl(batch) eval_losses.append(float(loss)) if eval_step % self._logging_freq == 0: eval_step_cost = get_timestamp() - eval_step_start log_dict = { 'loss': sum(eval_losses) / len(eval_losses), 'epoch': epoch, 'batch': eval_step, 'total_batch': total_eval_batch, 'eval_cost': eval_step_cost if eval_step == 0 else eval_step_cost / self._logging_freq, } self._module.validation_step_end(log_dict) eval_step_start = get_timestamp() eval_losses = [] if self._run_mode == 'step' and eval_step >= self._max_steps: logger.info("[eval] epoch {} : evaluting process is complete.". format(epoch)) return @paddle.no_grad() def _evaluate_impl(self, batch): self._module.model.eval() batch = self._module.pretreating_batch(batch) with paddle.amp.auto_cast( self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): if self._pp_degree == 1: loss = self._module.validation_step(batch) else: loss = self._module.model.eval_batch(batch, compute_loss=True) return loss @paddle.no_grad() def predict(self, epoch=1, test_data_loader=None): """ run one evaluation epoch over the test set. Args: epoch(int): the epoch index. test_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying test samples. """ self._module.model.eval() test_start = get_timestamp() test_losses = [] test_data_loader = test_data_loader() for test_step, batch in enumerate(test_data_loader): loss = self._predict_impl(batch) test_losses.append(float(loss)) if test_step % self._logging_freq == 0: test_cost = get_timestamp() - test_start log_dict = { 'loss': sum(test_losses) / len(test_losses), 'epoch': epoch, 'batch': test_step, 'test_cost': test_cost if test_step == 0 else test_cost / self._logging_freq, } self._module.test_step_end(log_dict) test_start = get_timestamp() test_losses = [] if test_step >= self._max_steps: logger.info("The predicting process is complete.") del test_data_loader return @paddle.no_grad() def _predict_impl(self, batch): self._module.model.eval() batch = self._module.pretreating_batch(batch) with paddle.amp.auto_cast( self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): if self._pp_degree == 1: loss = self._module.test_step(batch) else: loss = self._module.model.eval_batch(batch, compute_loss=True) return loss def save(self, epoch=0, step=0): """ save the state dicts of model and optimizer into an checkpoint. 
""" if self._dp_rank != 0: logger.info("DP_Rank %d doesn't save model" % self._dp_rank) return if self._output_dir and isinstance(self._output_dir, str): output_dir = os.path.join(self._output_dir, "epoch_%d_step_%d" % (epoch, step)) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) logger.info("Save model to %s" % output_dir) save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( output_dir, self._mp_rank, self._sharding_rank, self._pp_rank) if self._distributed else output_dir if self._sharding_stage == 3: self._module.model.get_all_parameters(convert2cpu=False) paddle.save(self._module.model.state_dict(), os.path.join(save_dir, "model.pdparams")) paddle.save(self._optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt")) meta_dict = { "epoch": epoch, "step": step, "cuda_rng_state": paddle.get_cuda_rng_state() } paddle.save(meta_dict, os.path.join(save_dir, "meta_state.pdopt")) save_auto_dir = os.path.join(output_dir, "auto_infer") save_for_auto_inference( os.path.join(save_auto_dir, "auto"), self._module.model) else: raise TypeError("`save` requires a valid value of `output_dir`.") def compress_model(self): if self._compress_configs is None: return self._distributed = (dist.get_world_size() > 1) # Load pretrained model before compression if 'pretrained' in self._compress_configs and self._compress_configs[ 'pretrained'] is not None: self._ckpt_dir = self._compress_configs['pretrained'] self.load() # Avoid loading again self._configs['save_load']['ckpt_dir'] = None if self.prune_configs is not None and self.prune_configs.enable: prune_model(self._module.model, self.prune_configs, self._module.input_spec()) #NOTE(minghaoBD): We haven't fully tested Prune+Quantization, so an "else if" is put here for separation. elif self.quant_configs is not None and self.quant_configs.enable: self._module.model, self.quanter = quant_model(self._module.model, self.quant_configs) def load(self): """ load the saved checkpoint file and update the state dicts of model and optimizer. """ if self._ckpt_dir and isinstance(self._ckpt_dir, str): logger.info("Try to load checkpoint from %s " % self._ckpt_dir) if self._quant_mode: load_dir = self._ckpt_dir else: load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( self._ckpt_dir, self._mp_rank, self._sharding_rank, self._pp_rank) if self._distributed else self._ckpt_dir model_path = os.path.join(load_dir, "model.pdparams") opt_path = os.path.join(load_dir, "model_state.pdopt") meta_path = os.path.join(load_dir, "meta_state.pdopt") if os.path.exists(model_path): model_dict = paddle.load(model_path) for name, param in self._module.model.state_dict().items(): assert name in model_dict.keys( ), "No param named `{}` was found in checkpoint file.".format( name) if param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) self._module.model.set_state_dict(model_dict) else: raise ValueError("No optimizer checkpoint file found in %s." % model_path) if self.mode == 'train': if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) self._optimizer.set_state_dict(opt_dict) else: raise ValueError( "No optimizer checkpoint file found in %s." % opt_path) if os.path.exists(meta_path): meta_dict = paddle.load(meta_path) self._load_recovery = { 'step': meta_dict['step'], 'epoch': meta_dict['epoch'], 'rng_state': meta_dict['cuda_rng_state'] } else: raise ValueError("No meta checkpoint file found in %s." 
% meta_path) logger.info("successfully load checkpoints") else: logger.warning("`load` requires a valid value of `ckpt_dir`.") raise TypeError("`load` requires a valid value of `ckpt_dir`.") def export(self): self._module.model.eval() input_spec = self._module.input_spec() save_dir = os.path.join(self._output_dir, "rank_{}".format(self._dp_rank)) if not self._quant_mode: export_inference_model(self._module.model, input_spec, save_dir, 'model') else: logger.info("export quantized model.") export_inference_model( self._module.model, input_spec, save_dir, 'model', export_quant_model=True, quanter=self.quanter) def inference(self, data): if self._inference_engine is None: # parse TensorRT config tensorrt_config = None if 'TensorRT' in self._inference_configs: tensorrt_config = TensorRTConfig( **self._inference_configs['TensorRT']) self._inference_engine = InferenceEngine( self._inference_configs['model_dir'], self._inference_configs['mp_degree'], tensorrt_config) return self._inference_engine.predict(data) def _print_summary(self): views_dict = { SummaryView.DeviceView: 'device', SummaryView.OverView: 'overview', SummaryView.ModelView: 'model', SummaryView.DistributedView: 'dist', SummaryView.KernelView: 'kernel', SummaryView.OperatorView: 'op', SummaryView.MemoryView: 'mem', SummaryView.MemoryManipulationView: 'memcpy', SummaryView.UDFView: 'udf', } default_views = [ SummaryView.OverView, SummaryView.ModelView, SummaryView.KernelView, SummaryView.OperatorView, ] def gen_views(cfg): # print all summary view if detailed=True if self.profiler_config.get('detailed', False): return None views = [] # override default view with user defined value if detailed=False for view in SummaryView: v = self.profiler_config.get('summary', {}).get( views_dict[view], None) if v is True or (v is None and view in default_views): views.append(view) return views or None self.profiler.summary( sorted_by=paddle.profiler.SortedKeys.GPUTotal, views=gen_views(self.profiler_config)) def _profiler_done(self): if not self.profiler: return logger.info("Profiler finished, prepare to print summary...") self.profiler.stop() self._print_summary() profiler_log = self.profiler_config.get('profiler_log', './profiler_log') logger.info( "For more information please install visualdl and run it with following command:" ) logger.info( "-------------------------------------------------------------------------------" ) logger.info(f"visualdl --host 0.0.0.0 --logdir {profiler_log}") logger.info( "-------------------------------------------------------------------------------" ) ================================================ FILE: ppfleetx/core/engine/inference_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
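# ---------------------------------------------------------------------------
# Illustrative usage sketch (comments only). The model directory, input dict
# and shape-range file below are assumptions; note that `workspace_size` is
# passed through `eval`, so a string expression such as '1 << 30' is expected:
#
#     trt = TensorRTConfig(precision='fp16', workspace_size='1 << 30',
#                          collect_shape=True,
#                          shape_range_info_filename='./shape_range_info.pbtxt')
#     engine = InferenceEngine('./output', mp_degree=1, tensorrt_config=trt)
#     outputs = engine.predict({'input_ids': ids})  # dict keyed by output names
# ---------------------------------------------------------------------------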
import os import numpy as np from collections.abc import Sequence, Mapping import paddle import paddle.distributed.fleet as fleet # TensorRT precisions TRT_PRECISIONS = { 'fp32': paddle.inference.PrecisionType.Float32, 'fp16': paddle.inference.PrecisionType.Half, 'int8': paddle.inference.PrecisionType.Int8, } class _StaticGuard(object): def __init__(self): pass def __enter__(self): paddle.enable_static() def __exit__(self, exc_type, exc_val, exc_tb): paddle.disable_static() class TensorRTConfig(object): """ TensorRT Inference Configuration Args: max_batch_size (int): The maxmum batch size of input data. Default 1 workspace_size (int): The size of TensorRT workspace in bytes. Default 1<<30 min_subgraph_size (int): The minimum subgraph node size to convert subgraph to TensorRT engine. Default 3 precision (str): The inference precision, can be 'fp32', 'fp16' and 'int8'. Default 'fp16' use_static (bool): Whether to serialize and save TensorRT engine. Default False use_calib_mode (bool): Whether to use TensorRT calibration. Default False collect_shape (bool): Whether to collect dynamic shape. Default False shape_range_info_filename (str): Path to dynamic shape range file. Default None """ def __init__(self, max_batch_size=1, workspace_size=1 << 30, min_subgraph_size=3, precision='fp16', use_static=False, use_calib_mode=False, collect_shape=False, shape_range_info_filename=None): self.max_batch_size = max_batch_size self.workspace_size = eval(workspace_size) self.min_subgraph_size = min_subgraph_size self.precision = precision self.use_static = use_static self.use_calib_mode = use_calib_mode self.shape_range_info_filename = shape_range_info_filename self.collect_shape = collect_shape @property def precision(self): return TRT_PRECISIONS[self._precision] @precision.setter def precision(self, value): print("value", value) assert value.lower() in ['fp32', 'fp16', 'int8'], \ "TensorRT precision can only be 'fp32', 'fp16' or 'int8', " \ "but got {}".format(value.lower()) self._precision = value.lower() @property def collect_shape(self): return self._collect_shape @collect_shape.setter def collect_shape(self, value): if value: assert self.shape_range_info_filename is not None, \ "shape_range_info_filename should be set in " \ "collect_shape mode" else: assert self.shape_range_info_filename and \ os.path.isfile(self.shape_range_info_filename), \ "shape_range_info_filename {} is not a " \ "file".format(self.shape_range_info_filename) self._collect_shape = value class InferenceEngine(object): """ Model Parallel Inference Engine Args: model_dir (string): root directory of inference model mp_degree (int): model parallel size tensorrt_config (TensorRTConfig): configurations for TensorRT inference """ def __init__(self, model_dir, mp_degree=1, tensorrt_config=None, device=None): self.model_dir = model_dir self.mp_degree = mp_degree self.tensorrt_config = tensorrt_config self.auto = False self.device = device for fname in os.listdir(model_dir): if "auto" in fname: self.auto = True break if mp_degree == 1: self.nranks = 1 self.rank = 0 else: self.nranks = fleet.worker_num() self.rank = fleet.worker_index() if not self.auto: self._check_model() self._static_guard = _StaticGuard() with self._static_guard: self._init_predictor() def _check_model(self): if not os.path.isdir(self.model_dir): raise ValueError('model_dir is not a directory') rank_path = os.path.join(self.model_dir, "rank_{}".format(self.rank)) if not os.path.isdir(rank_path): raise ValueError('rank_{} directory not found'.format(self.rank)) 
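# Exactly one *.pdmodel and one *.pdiparams file are expected under the
# rank_{rank} directory; _check_and_get_file below raises a ValueError when
# zero or multiple candidates are found.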
model_files = [] param_files = [] for fname in os.listdir(rank_path): if os.path.splitext(fname)[1] == '.pdmodel': model_files.append(fname) if os.path.splitext(fname)[1] == '.pdiparams': param_files.append(fname) def _check_and_get_file(files, tag): if len(files) == 0: raise ValueError("no {} file found under {}".format(tag, rank_path)) elif len(files) > 1: raise ValueError("multiple {} file found under {}".format( tag, rank_path)) else: return os.path.join(self.model_dir, 'rank_{}'.format(self.rank), files[0]) self.model_file = _check_and_get_file(model_files, 'pdmodel') self.param_file = _check_and_get_file(param_files, 'pdiparams') def _generate_comm_init_config(self, rank, nranks): ring_id_to_ranks = ','.join(['0'] + [str(i) for i in range(nranks)]) rank_to_ring_ids = ''.join(['{},0\n'.format(i) for i in range(nranks)]) comm_str = '[ring_id -> ranks]\n' + ring_id_to_ranks + \ '\n[rank -> ring_ids]\n' + rank_to_ring_ids config_fname = "./.comm_config{}.csv".format(rank) if os.path.exists(config_fname): os.remove(config_fname) with open(config_fname, 'w') as f: f.write(comm_str) return config_fname def _init_predictor(self): if self.auto: self.model_file = os.path.join( self.model_dir, 'auto_dist{}.pdmodel'.format(self.rank)) self.param_file = os.path.join( self.model_dir, 'auto_dist{}.pdiparams'.format(self.rank)) config = paddle.inference.Config(self.model_file, self.param_file) config.enable_memory_optim() config.switch_ir_optim(True) if self.device: device_id = int( os.environ.get(f'FLAGS_selected_{self.device}s', 0)) config.enable_custom_device(self.device, device_id) elif paddle.fluid.core.is_compiled_with_cuda(): device_id = int(os.environ.get('FLAGS_selected_gpus', 0)) config.enable_use_gpu(100, device_id) elif paddle.fluid.core.is_compiled_with_xpu(): device_id = int(os.environ.get('FLAGS_selected_xpus', 0)) config.enable_xpu() config.set_xpu_device_id(device_id) # distributed config if self.mp_degree > 1: trainer_endpoints = fleet.worker_endpoints() current_endpoint = trainer_endpoints[self.rank] dist_config = config.dist_config() dist_config.set_ranks(self.nranks, self.rank) dist_config.set_endpoints(trainer_endpoints, current_endpoint) dist_config.enable_dist_model(True) if self.auto: config_fname = os.path.join(self.model_dir, "rank_mapping.csv") else: config_fname = self._generate_comm_init_config(self.rank, self.nranks) dist_config.set_comm_init_config(config_fname) config.set_dist_config(dist_config) # TensorRT config if self.tensorrt_config: config.enable_tensorrt_engine( max_batch_size=self.tensorrt_config.max_batch_size, workspace_size=self.tensorrt_config.workspace_size, min_subgraph_size=self.tensorrt_config.min_subgraph_size, precision_mode=self.tensorrt_config.precision, use_static=self.tensorrt_config.use_static, use_calib_mode=self.tensorrt_config.use_calib_mode) if self.tensorrt_config.collect_shape: config.collect_shape_range_info( self.tensorrt_config.shape_range_info_filename) else: config.enable_tuned_tensorrt_dynamic_shape( self.tensorrt_config.shape_range_info_filename, True) self.predictor = paddle.inference.create_predictor(config) def input_names(self): return self.predictor.get_input_names() def output_names(self): return self.predictor.get_output_names() def predict(self, data): # data in dict/list format with self._static_guard: if isinstance(data, Sequence): if len(data) != len(self.input_names()): raise ValueError() for d, name in zip(data, self.input_names()): handle = self.predictor.get_input_handle(name) handle.copy_from_cpu(np.array(d.copy())) 
elif isinstance(data, Mapping): # key check for k, v in data.items(): handle = self.predictor.get_input_handle(k) handle.copy_from_cpu(np.array(v)) else: raise ValueError() self.predictor.run() return {name: self.predictor.get_output_handle(name).copy_to_cpu() \ for name in self.output_names()} ================================================ FILE: ppfleetx/core/module/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .basic_module import BasicModule ================================================ FILE: ppfleetx/core/module/basic_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The file has been adapted from lightning file: # https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/module.py # Git commit hash: 2d9e00fab64c8b19a8646f755a95bcb092aa710f # We retain the following license from the original files: # Copyright 2018-2021 William Falcon. All rights reserved. # # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. import paddle import paddle.nn as nn class BasicModule(nn.Layer): """ """ def __init__(self, configs, *args, **kwargs): self.configs = self.process_configs(configs) super().__init__(*args, **kwargs) self.model = self.get_model() def process_configs(self, configs): return configs def get_model(self): raise NotImplementedError def get_loss_fn(self): pass def pretreating_batch(self, batch): return batch def forward(self, *args, **kwargs): return super().forward(*args, **kwargs) def training_step(self, *args, **kwargs): raise NotImplementedError def training_step_end(self, *args, **kwargs): pass def validation_step(self, *args, **kwargs): pass def validation_step_end(self, *args, **kwargs): pass def test_step(self, *args, **kwargs): pass def test_step_end(self, *args, **kwargs): pass def backward(self, loss): loss.backward() def input_spec(self): raise NotImplementedError( "Please redefine Module.input_spec for model export") def inference_end(self, outputs): pass def training_epoch_end(self, *args, **kwargs): pass def validation_epoch_end(self, *args, **kwargs): pass ================================================ FILE: ppfleetx/data/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import random import numpy as np import paddle from ppfleetx.data import dataset, sampler, utils from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger def build_auto_dataset(config, mode): """ build dataset for auto parallel """ assert mode in ['Train', 'Eval', 'Test' ], "Dataset mode should be Train, Eval, Test" if mode not in config: return None dataset = build_dataset(config, mode) collate_fn = None if 'collate_fn' in config[mode].keys(): collate_fn_cfg = config[mode].pop('collate_fn', None) if isinstance(collate_fn_cfg, str): collate_fn = getattr( utils, collate_fn_cfg) if collate_fn_cfg is not None else None elif isinstance(collate_fn_cfg, dict): collate_fn_class_name = collate_fn_cfg.pop("name") collate_fn = eval("utils.{}".format(collate_fn_class_name))( **collate_fn_cfg) logger.debug("build collate_fn({}) success...".format(collate_fn)) dataset.collate_fn = collate_fn dataset.sample_split = config[mode].pop('sample_split', None) return dataset def build_dataset(config, mode): # build dataset config_dataset = config[mode].dataset config_dataset = copy.deepcopy(config_dataset) dataset_name = config_dataset.pop('name') dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) return dataset def build_dataloader(config, mode): assert mode in ['Train', 'Eval', 'Test' ], "Dataset mode should be Train, Eval, Test" if mode not in config: return None dataset = build_dataset(config, mode) batch_sampler = None # build sampler if 'sampler' in config[mode].keys(): config_sampler = config[mode].sampler config_sampler = copy.deepcopy(config_sampler) sampler_name = config_sampler.pop("name") batch_sampler = eval("sampler.{}".format(sampler_name))( dataset, **config_sampler) logger.debug("build batch_sampler({}) success...".format( batch_sampler)) collate_fn = None config_loader = {} # build dataloader if 'loader' in config[mode].keys(): config_loader = config[mode].loader config_loader = copy.deepcopy(config_loader) collate_fn_cfg = config_loader.pop('collate_fn', None) if isinstance(collate_fn_cfg, str): collate_fn = getattr( utils, collate_fn_cfg) if collate_fn_cfg is not None else None elif isinstance(collate_fn_cfg, dict): collate_fn_class_name = collate_fn_cfg.pop("name") collate_fn = eval("utils.{}".format(collate_fn_class_name))( **collate_fn_cfg) logger.debug("build collate_fn({}) success...".format(collate_fn)) def worker_init_fn(worker_id): """ set seed in subproces for dataloader when num_workers > 0""" np.random.seed(env.get_dp_seed() + worker_id) random.seed(env.get_dp_seed() + worker_id) data_loader = paddle.io.DataLoader( dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, worker_init_fn=worker_init_fn, **config_loader) logger.debug("build data_loader({}) success...".format(data_loader)) return data_loader ================================================ FILE: ppfleetx/data/data_tools/__init__.py 
================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/cpp/Makefile ================================================ CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color CPPFLAGS += $(shell $(PYTHON_BIN) -m pybind11 --includes) CPPFLAGS += $(shell python3-config --includes) LIBNAME = fast_index_map_helpers LIBEXT = .so default: $(LIBNAME)$(LIBEXT) %$(LIBEXT): %.cpp $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ ================================================ FILE: ppfleetx/data/data_tools/cpp/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/cpp/compile.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import subprocess path = os.path.abspath(os.path.dirname(__file__)) def compile_helper(): """Compile helper function ar runtime. Make sure this is invoked on a single process.""" import sys excutable = sys.executable ret = subprocess.run(['make', '-C', path, f'PYTHON_BIN={excutable}']) if ret.returncode != 0: print("Making C++ dataset helpers module failed, exiting.") sys.exit(1) ================================================ FILE: ppfleetx/data/data_tools/cpp/fast_index_map_helpers.cpp ================================================ /* coding=utf-8 Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Helper methods for fast index mapping builds */ #include #include #include #include #include #include #include #include namespace py = pybind11; using namespace std; const int32_t LONG_SENTENCE_LEN = 512; void build_blending_indices( py::array_t &dataset_index, // NOLINT py::array_t &dataset_sample_index, // NOLINT const py::array_t &weights, const int32_t num_datasets, const int64_t size, const bool verbose) { /* Given multiple datasets and a weighting array, build samples such that it follows those wieghts.*/ if (verbose) { std::cout << "> building indices for blendable datasets ..." << std::endl; } // Get the pointer access without the checks. auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); auto weights_ptr = weights.unchecked<1>(); // Initialize buffer for number of samples used for each dataset. int64_t current_samples[num_datasets]; for (int64_t i = 0; i < num_datasets; ++i) { current_samples[i] = 0; } // For each sample: for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) { // Determine where the max error in sampling is happening. auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); int64_t max_error_index = 0; double max_error = weights_ptr[0] * sample_idx_double - static_cast(current_samples[0]); for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) { double error = weights_ptr[dataset_idx] * sample_idx_double - static_cast(current_samples[dataset_idx]); if (error > max_error) { max_error = error; max_error_index = dataset_idx; } } // Populate the indices. dataset_index_ptr[sample_idx] = static_cast(max_error_index); dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; // Update the total samples. current_samples[max_error_index] += 1; } // print info if (verbose) { std::cout << " > sample ratios:" << std::endl; for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) { auto ratio = static_cast(current_samples[dataset_idx]) / static_cast(size); std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; } } } py::array build_sample_idx(const py::array_t &sizes_, const py::array_t &doc_idx_, const int32_t seq_length, const int32_t num_epochs, const int64_t tokens_per_epoch) { /* Sample index (sample_idx) is used for gpt2 like dataset for which the documents are flattened and the samples are built based on this 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 1] is the starting offset in that document.*/ // Consistency checks. assert(seq_length > 1); assert(num_epochs > 0); assert(tokens_per_epoch > 1); // Remove bound checks. auto sizes = sizes_.unchecked<1>(); auto doc_idx = doc_idx_.unchecked<1>(); // Mapping and it's length (1D). 
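  // Each sample spans seq_length + 1 tokens (inputs plus the one-token shift used
  // for the labels) and consecutive samples share one boundary token, so a pool of
  // num_epochs * tokens_per_epoch tokens yields (tokens - 1) / seq_length samples,
  // which is what the expression below computes.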
int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; int64_t *sample_idx = new int64_t[2 * (num_samples + 1)]; cout << " using:" << endl << std::flush; cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl << std::flush; cout << " number of epochs: " << num_epochs << endl << std::flush; cout << " sequence length: " << seq_length << endl << std::flush; cout << " total number of samples: " << num_samples << endl << std::flush; // Index into sample_idx. int64_t sample_index = 0; // Index into doc_idx. int64_t doc_idx_index = 0; // Begining offset for each document. int64_t doc_offset = 0; // Start with first document and no offset. sample_idx[2 * sample_index] = doc_idx_index; sample_idx[2 * sample_index + 1] = doc_offset; ++sample_index; while (sample_index <= num_samples) { // Start with a fresh sequence. int64_t remaining_seq_length = seq_length + 1; while (remaining_seq_length != 0) { // Get the document length. auto doc_id = doc_idx[doc_idx_index]; auto doc_length = sizes[doc_id] - doc_offset; // And add it to the current sequence. remaining_seq_length -= doc_length; // If we have more than a full sequence, adjust offset and set // remaining length to zero so we return from the while loop. // Note that -1 here is for the same reason we have -1 in // `_num_epochs` calculations. if (remaining_seq_length <= 0) { doc_offset += (remaining_seq_length + doc_length - 1); remaining_seq_length = 0; } else { // Otherwise, start from the begining of the next document. ++doc_idx_index; doc_offset = 0; } } // Record the sequence. sample_idx[2 * sample_index] = doc_idx_index; sample_idx[2 * sample_index + 1] = doc_offset; ++sample_index; } // Method to deallocate memory. py::capsule free_when_done(sample_idx, [](void *mem_) { int64_t *mem = reinterpret_cast(mem_); delete[] mem; }); // Return the numpy array. const auto byte_size = sizeof(int64_t); return py::array(std::vector{num_samples + 1, 2}, // shape {2 * byte_size, byte_size}, // C-style contiguous strides sample_idx, // the data pointer free_when_done); // numpy array references } inline int32_t get_target_sample_len(const int32_t short_seq_ratio, const int32_t max_length, std::mt19937 &rand32_gen) { /* Training sample length. */ if (short_seq_ratio == 0) { return max_length; } const auto random_number = rand32_gen(); if ((random_number % short_seq_ratio) == 0) { return 2 + random_number % (max_length - 1); } return max_length; } template py::array build_mapping_impl(const py::array_t &docs_, const py::array_t &sizes_, const int32_t num_epochs, const uint64_t max_num_samples, const int32_t max_seq_length, const double short_seq_prob, const int32_t seed, const bool verbose, const int32_t min_num_sent) { /* Build a mapping of (start-index, end-index, sequence-length) where start and end index are the indices of the sentences in the sample and sequence-length is the target sequence length. */ // Consistency checks. assert(num_epochs > 0); assert(max_seq_length > 1); assert(short_seq_prob >= 0.0); assert(short_seq_prob <= 1.0); assert(seed > 0); // Remove bound checks. auto docs = docs_.unchecked<1>(); auto sizes = sizes_.unchecked<1>(); // For efficiency, convert probability to ratio. Note: rand() generates int. 
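  // For example, short_seq_prob = 0.1 gives short_seq_ratio = 10, so roughly one in
  // ten calls to get_target_sample_len() returns a shortened target length, while
  // short_seq_prob = 0 keeps every sample at the maximum sequence length.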
int32_t short_seq_ratio = 0; if (short_seq_prob > 0) { short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); } if (verbose) { const auto sent_start_index = docs[0]; const auto sent_end_index = docs[docs_.shape(0) - 1]; const auto num_sentences = sent_end_index - sent_start_index; cout << " using:" << endl << std::flush; cout << " number of documents: " << docs_.shape(0) - 1 << endl << std::flush; cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl << std::flush; cout << " total number of sentences: " << num_sentences << endl << std::flush; cout << " number of epochs: " << num_epochs << endl << std::flush; cout << " maximum number of samples: " << max_num_samples << endl << std::flush; cout << " maximum sequence length: " << max_seq_length << endl << std::flush; cout << " minimum sentences num: " << min_num_sent << endl << std::flush; cout << " short sequence probability: " << short_seq_prob << endl << std::flush; cout << " short sequence ration (1/prob): " << short_seq_ratio << endl << std::flush; cout << " seed: " << seed << endl << std::flush; } // Mapping and it's length (1D). int64_t num_samples = -1; DocIdx *maps = NULL; // Perform two iterations, in the first iteration get the size // and allocate memory and in the second iteration populate the map. bool second = false; for (int32_t iteration = 0; iteration < 2; ++iteration) { // Set the seed so both iterations produce the same results. std::mt19937 rand32_gen(seed); // Set the flag on second iteration. second = (iteration == 1); // Counters: uint64_t empty_docs = 0; uint64_t one_sent_docs = 0; uint64_t long_sent_docs = 0; // Current map index. uint64_t map_index = 0; // For each epoch: for (int32_t epoch = 0; epoch < num_epochs; ++epoch) { if (map_index >= max_num_samples) { if (verbose && (!second)) { cout << " reached " << max_num_samples << " samples after " << epoch << " epochs ..." << endl << std::flush; } break; } if (epoch > 0 && map_index == 0) { cout << endl << " No available documtment find this dataset." << endl << std::flush; throw std::invalid_argument( "Invalid dataset! the document should be with more than " + std::to_string(min_num_sent) + " scentences."); } // For each document: for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) { // Document sentences are in [sent_index_first, sent_index_last) const auto sent_index_first = docs[doc]; const auto sent_index_last = docs[doc + 1]; // At the begining of the document previous index is the // start index. auto prev_start_index = sent_index_first; // Remaining documents. auto num_remain_sent = sent_index_last - sent_index_first; // Some bookkeeping if ((epoch == 0) && (!second)) { if (num_remain_sent == 0) { ++empty_docs; } if (num_remain_sent == 1) { ++one_sent_docs; } } // Detect documents with long sentences. bool contains_long_sentence = false; if (num_remain_sent > 1) { for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { if (sizes[sent_index] > LONG_SENTENCE_LEN) { if ((epoch == 0) && (!second)) { ++long_sent_docs; } contains_long_sentence = true; break; } } } // If we have more than two sentences. if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { // Set values. auto seq_len = int32_t{0}; auto num_sent = int32_t{0}; auto target_seq_len = get_target_sample_len( short_seq_ratio, max_seq_length, rand32_gen); // Loop through sentences. for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { // Add the size and number of sentences. 
seq_len += sizes[sent_index]; ++num_sent; --num_remain_sent; // If we have reached the target length. // and if not only one sentence is left in the document. // and if we have at least two sentneces. // and if we have reached end of the document. if (((seq_len >= target_seq_len) && (num_remain_sent > 1) && (num_sent >= min_num_sent)) || (num_remain_sent == 0)) { // Check for overflow. if ((3 * map_index + 2) > std::numeric_limits::max()) { cout << "number of samples exceeded maximum " << "allowed by type int64: " << std::numeric_limits::max() << endl; throw std::overflow_error("Number of samples"); } // Populate the map. if (second) { const auto map_index_0 = 3 * map_index; maps[map_index_0] = static_cast(prev_start_index); maps[map_index_0 + 1] = static_cast(sent_index + 1); maps[map_index_0 + 2] = static_cast(target_seq_len); } // Update indices / counters. ++map_index; prev_start_index = sent_index + 1; target_seq_len = get_target_sample_len( short_seq_ratio, max_seq_length, rand32_gen); seq_len = 0; num_sent = 0; } } // for (auto sent_index=sent_index_first; ... } // if (num_remain_sent > 1) { } // for (int doc=0; doc < num_docs; ++doc) { } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { cout << " number of empty documents: " << empty_docs << endl << std::flush; cout << " number of documents with one sentence: " << one_sent_docs << endl << std::flush; cout << " number of documents with long sentences: " << long_sent_docs << endl << std::flush; cout << " will create mapping for " << map_index << " samples" << endl << std::flush; } assert(maps == NULL); assert(num_samples < 0); maps = new DocIdx[3 * map_index]; num_samples = static_cast(map_index); } } // for (int iteration=0; iteration < 2; ++iteration) { // Shuffle. // We need a 64 bit random number generator as we might have more // than 2 billion samples. std::mt19937_64 rand64_gen(seed + 1); for (auto i = (num_samples - 1); i > 0; --i) { const auto j = static_cast(rand64_gen() % (i + 1)); const auto i0 = 3 * i; const auto j0 = 3 * j; // Swap values. swap(maps[i0], maps[j0]); swap(maps[i0 + 1], maps[j0 + 1]); swap(maps[i0 + 2], maps[j0 + 2]); } // Method to deallocate memory. py::capsule free_when_done(maps, [](void *mem_) { DocIdx *mem = reinterpret_cast(mem_); delete[] mem; }); // Return the numpy array. const auto byte_size = sizeof(DocIdx); return py::array(std::vector{num_samples, 3}, // shape {3 * byte_size, byte_size}, // C-style contiguous strides maps, // the data pointer free_when_done); // numpy array references } py::array build_mapping(const py::array_t &docs_, const py::array_t &sizes_, const int num_epochs, const uint64_t max_num_samples, const int max_seq_length, const double short_seq_prob, const int seed, const bool verbose, const int32_t min_num_sent) { if (sizes_.size() > std::numeric_limits::max()) { if (verbose) { cout << " using uint64 for data mapping..." << endl << std::flush; } return build_mapping_impl( docs_, sizes_, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, verbose, min_num_sent); } else { if (verbose) { cout << " using uint32 for data mapping..." 
<< endl << std::flush; } return build_mapping_impl( docs_, sizes_, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, verbose, min_num_sent); } } template py::array build_blocks_mapping_impl( const py::array_t &docs_, const py::array_t &sizes_, const py::array_t &titles_sizes_, const int32_t num_epochs, const uint64_t max_num_samples, const int32_t max_seq_length, const int32_t seed, const bool verbose, const bool use_one_sent_blocks) { /* Build a mapping of (start-index, end-index, sequence-length) where start and end index are the indices of the sentences in the sample and sequence-length is the target sequence length. */ // Consistency checks. assert(num_epochs > 0); assert(max_seq_length > 1); assert(seed > 0); // Remove bound checks. auto docs = docs_.unchecked<1>(); auto sizes = sizes_.unchecked<1>(); auto titles_sizes = titles_sizes_.unchecked<1>(); if (verbose) { const auto sent_start_index = docs[0]; const auto sent_end_index = docs[docs_.shape(0) - 1]; const auto num_sentences = sent_end_index - sent_start_index; cout << " using:" << endl << std::flush; cout << " number of documents: " << docs_.shape(0) - 1 << endl << std::flush; cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl << std::flush; cout << " total number of sentences: " << num_sentences << endl << std::flush; cout << " number of epochs: " << num_epochs << endl << std::flush; cout << " maximum number of samples: " << max_num_samples << endl << std::flush; cout << " maximum sequence length: " << max_seq_length << endl << std::flush; cout << " seed: " << seed << endl << std::flush; } // Mapping and its length (1D). int64_t num_samples = -1; DocIdx *maps = NULL; // Acceptable number of sentences per block. int min_num_sent = 2; if (use_one_sent_blocks) { min_num_sent = 1; } // Perform two iterations, in the first iteration get the size // and allocate memory and in the second iteration populate the map. bool second = false; for (int32_t iteration = 0; iteration < 2; ++iteration) { // Set the flag on second iteration. second = (iteration == 1); // Current map index. uint64_t map_index = 0; uint64_t empty_docs = 0; uint64_t one_sent_docs = 0; uint64_t long_sent_docs = 0; // For each epoch: for (int32_t epoch = 0; epoch < num_epochs; ++epoch) { // assign every block a unique id int32_t block_id = 0; if (map_index >= max_num_samples) { if (verbose && (!second)) { cout << " reached " << max_num_samples << " samples after " << epoch << " epochs ..." << endl << std::flush; } break; } // For each document: for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) { // Document sentences are in [sent_index_first, sent_index_last) const auto sent_index_first = docs[doc]; const auto sent_index_last = docs[doc + 1]; const auto target_seq_len = max_seq_length - titles_sizes[doc]; // At the begining of the document previous index is the // start index. auto prev_start_index = sent_index_first; // Remaining documents. auto num_remain_sent = sent_index_last - sent_index_first; // Some bookkeeping if ((epoch == 0) && (!second)) { if (num_remain_sent == 0) { ++empty_docs; } if (num_remain_sent == 1) { ++one_sent_docs; } } // Detect documents with long sentences. 
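        // A document is flagged if any of its sentences exceeds LONG_SENTENCE_LEN
        // (512) tokens; flagged documents are skipped below and contribute no
        // blocks to the mapping.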
bool contains_long_sentence = false; if (num_remain_sent >= min_num_sent) { for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { if (sizes[sent_index] > LONG_SENTENCE_LEN) { if ((epoch == 0) && (!second)) { ++long_sent_docs; } contains_long_sentence = true; break; } } } // If we have enough sentences and no long sentences. if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { // Set values. auto seq_len = int32_t{0}; auto num_sent = int32_t{0}; // Loop through sentences. for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { // Add the size and number of sentences. seq_len += sizes[sent_index]; ++num_sent; --num_remain_sent; // If we have reached the target length. // and there are an acceptable number of sentences left // and if we have at least the minimum number of sentences. // or if we have reached end of the document. if (((seq_len >= target_seq_len) && (num_remain_sent >= min_num_sent) && (num_sent >= min_num_sent)) || (num_remain_sent == 0)) { // Populate the map. if (second) { const auto map_index_0 = 4 * map_index; // Each sample has 4 items: the starting sentence index, ending // sentence index, // the index of the document from which the block comes (used // for fetching titles) // and the unique id of the block (used for creating block // indexes) maps[map_index_0] = static_cast(prev_start_index); maps[map_index_0 + 1] = static_cast(sent_index + 1); maps[map_index_0 + 2] = static_cast(doc); maps[map_index_0 + 3] = static_cast(block_id); } // Update indices / counters. ++map_index; ++block_id; prev_start_index = sent_index + 1; seq_len = 0; num_sent = 0; } } // for (auto sent_index=sent_index_first; ... } // if (num_remain_sent > 1) { } // for (int doc=0; doc < num_docs; ++doc) { } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { cout << " number of empty documents: " << empty_docs << endl << std::flush; cout << " number of documents with one sentence: " << one_sent_docs << endl << std::flush; cout << " number of documents with long sentences: " << long_sent_docs << endl << std::flush; cout << " will create mapping for " << map_index << " samples" << endl << std::flush; } assert(maps == NULL); assert(num_samples < 0); maps = new DocIdx[4 * map_index]; num_samples = static_cast(map_index); } } // for (int iteration=0; iteration < 2; ++iteration) { // Shuffle. // We need a 64 bit random number generator as we might have more // than 2 billion samples. std::mt19937_64 rand64_gen(seed + 1); for (auto i = (num_samples - 1); i > 0; --i) { const auto j = static_cast(rand64_gen() % (i + 1)); const auto i0 = 4 * i; const auto j0 = 4 * j; // Swap values. swap(maps[i0], maps[j0]); swap(maps[i0 + 1], maps[j0 + 1]); swap(maps[i0 + 2], maps[j0 + 2]); swap(maps[i0 + 3], maps[j0 + 3]); } // Method to deallocate memory. py::capsule free_when_done(maps, [](void *mem_) { DocIdx *mem = reinterpret_cast(mem_); delete[] mem; }); // Return the numpy array. 
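  // The returned array has shape [num_samples, 4]; each row stores
  // (start sentence index, end sentence index, document index, block id).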
const auto byte_size = sizeof(DocIdx); return py::array(std::vector{num_samples, 4}, // shape {4 * byte_size, byte_size}, // C-style contiguous strides maps, // the data pointer free_when_done); // numpy array references } py::array build_blocks_mapping( const py::array_t &docs_, const py::array_t &sizes_, const py::array_t &titles_sizes_, const int num_epochs, const uint64_t max_num_samples, const int max_seq_length, const int seed, const bool verbose, const bool use_one_sent_blocks) { if (sizes_.size() > std::numeric_limits::max()) { if (verbose) { cout << " using uint64 for data mapping..." << endl << std::flush; } return build_blocks_mapping_impl( docs_, sizes_, titles_sizes_, num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); } else { if (verbose) { cout << " using uint32 for data mapping..." << endl << std::flush; } return build_blocks_mapping_impl( docs_, sizes_, titles_sizes_, num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); } } PYBIND11_MODULE(fast_index_map_helpers, m) { m.def("build_mapping", &build_mapping); m.def("build_blocks_mapping", &build_blocks_mapping); m.def("build_sample_idx", &build_sample_idx); m.def("build_blending_indices", &build_blending_indices); } ================================================ FILE: ppfleetx/data/data_tools/ernie/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/README.md ================================================ # PaddleFleetX 预训练数据准备流程 本示例致力于打造基于PaddleFleetX预训练模型的最佳实践。 我们将预训练数据过程划分为以下部分 - 原始数据转换,原始文本转换为jsonl的json字符串格式。 - 数据ID化,断句、分词、tokenize转化为token id格式。 - 训练index文件生成,生成train、valid、test的每个样本索引。 - token动态mask(可选),python 层实时mask文本。 本目录下主要包含一下文件: ``` ├── create_pretraining_data.py ├── dataset_utils.py ├── ernie_dataset.py ├── helpers.cpp ├── Makefile ├── README.md └── trans_to_json.py ``` 其中,`trans_to_json.py`是原始数据转化的脚本,将数据转化为json串格式。 `create_pretraining_data.py`将jsonl文本,断句、分词后,tokenizer转化为token id。 `dataset_utils.py`中包含了index生成、动态mask的实现。 `ernie_dataset.py`通过调用`dataset_utils.py`的一些函数,产生ernie的输入dataset。 ### 环境依赖 - tqdm - numpy - pybind11 - tool_helpers - lac (可选) - zstandard (可选) 安装命令`pip install tqdm numpy pybind11 tool_helpers lac zstandard`。另,部分功能需要`g++>=4.8`编译支持 ## 训练全流程数据Pipeline 飞桨是自主研发、功能完备、开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体 |步骤|阶段                     |数据格式| 样例| |-|-|-|-| | 0️⃣初始状态 | -|原始数据:
**每个doc之间用空行间隔开**<br/>- 中文,默认每句换行符,作为句子结束。<br/>- 英文,默认使用nltk判断句子结束 | ```飞桨是功能完备、开源开放的产业级深度学习平台。```<br/>```飞桨拥有核心训练和推理框架、基础模型库。```<br/><br/>```PaddleNLP是自然语言处理领域的优秀工具。``` |
|1️⃣原始数据转换<br/>`trans_to_json.py`|预处理<br/>输入:0️⃣初始状态<br/>输出:jsonl|jsonl格式:每个doc对应一行json字符串| ```{"text": "飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有..."}```<br/>```{"text": "PaddleNLP是自然语言..."}``` |
|❇️(**可选**)数据中文分词<br/>`words_segmentation.py`|语料分词:中文WWM<br/>输入:jsonl<br/>输出:0️⃣初始状态| 将jsonl格式的数据,恢复成分词后的原始格式数据 | ```飞桨 是 功能 完备、开源 开放的 产业级 深度学习 平台。```<br/>```飞桨 拥有 核心 训练和推理 框架、基础 模型库。```<br/><br/>```PaddleNLP 是 自然语言处理领域 的 优秀工具。``` |
|2️⃣数据ID化<br/>`create_pretrain_data.py`|预处理| npy格式:数据id化后的token id<br/>npz格式:数据句子、文章位置索引 | - |
|3️⃣训练index文件生成|训练启动|npy格式:<br/>根据训练步数max_steps生成<br/>
    train、valid、test的每个样本索引文件| - |4️⃣token动态mask(可选)| Dataset取数据 | 无 |- 注意: - **❇️(**可选**)数据中文分词** 是中文预训练做 WWM 的可选步骤 - 当你的数据比较少时,分词耗时较少,不需要词步骤。直接在`create_pretrain_data.py`步骤中分词即可。 - 目的是为了提前分词,加快后续数据ID转化步骤。 - 如果这里输入的是 jsonl格式文件,最好为多文件,`trans_to_json.py` 时候开启`no-merge`选项。 - 当你的数据集比较大,或者需要尝试多次转换数据的时候,提前分词可以避免`create_pretrain_data.py`时每次都运行一次分词程序。 - 转换后,需要重新 进行步骤 1️⃣`原始数据转换 trans_to_json.py`,最后2️⃣`数据ID化`步骤设置`--cn_splited=True`参数。 - 2️⃣`数据ID化`也可以在转化ID的同时,一起实现分词。不需要❇️`数据中文分词`步骤。 ## 数据教程汇总 针对目前开源的数据集,PaddleFleetX提供了详细的数据教程,点击对应数据集的链接,即可开始进行数据制作: | 名称 | 文本类型 | 纯文本大小 | 适配模型 |-|-|-|-| | [CLUECorpusSmall](./docs/CLUECorpusSmall.md)| 中文 | 14GB | ERNIE | [OpenWebText2](./docs/OpenWebText2.md) | 英文 | 70GB | GPT | [WuDaoCorpus2.0 Base](./docs/WuDaoCorpusBase.md)| 中文 | 200GB | ERNIE | [CLUECorpus2020](./docs/CLUECorpus2020.md)| 中文 | 200GB | ERNIE ## ERNIE预训练详细准备 下面以ERNIE预训练为例,简要介绍一下预训练的全流程。 ### 原始数据 首先下载样例数据: ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 mkdir preprocess && cd preprocess wget https://bj.bcebos.com/paddlenlp/models/transformers/data_tools/baike.txt cd .. ``` ### 原始数据转换 jsonl 格式 使用`trans_to_json.py`转化为json串格式,下面是脚本的使用说明 ``` optional arguments: -h, --help show this help message and exit --input_path INPUT_PATH Path to you raw files. Folder or file path. 必须设置,可以是文件夹或者单个文件。文件夹中的目录默认最多搜索两层子目录。 --output_path OUTPUT_PATH Path to save the output json files. 必须设置,输出文件的名字。 --json_key JSON_KEY The content key of json file. 建议不修改,默认的key是text --doc_spliter DOC_SPLITER Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank. 根据实际情况修改,默认空行作为文章换行符。 --min_doc_length MIN_DOC_LENGTH Minimal char of a documment. 可选。过滤掉长度多短的文章,默认值10 --workers WORKERS Number of worker processes to launch 可选。多进程转化文件,适用于 input_path 中包含的文件数据较多的情况。每个文件,分配给不同worker处理 --log_interval LOG_INTERVAL Interval between progress updates. 可选。此处的interval是值处理完文件个数的间隔。 --no-merge Don't merge the file. 可选。默认不开启这个选项,默认每个文件转换的jsonl文本,会拼接成到同一个文件。 --no-shuffle Don't shuffle the file. 可选。默认不开启这个选项,默认对处理完进行shuffle。 ``` 根据说明,我们使用下面简单命令,可以得到`baike_sample.jsonl`文件。此处,我们对文章所有doc进行了shuffle。 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py --input_path ./preprocess --output_path preprocess/baike_sample #查看数据 head -1 baike_sample.jsonl {"text": "中国效仿西方发展工业的过程,于中华民国国民政府成立后至中日战争开战前夕已顺畅发展,尽管其间受到内外因素的多重干扰。尔后直至中日战争和国共战争的结束, 中国始有较为长期的和平发展时期。\n1980年代以来,邓小平政府宣布改革开放,开始实行社会主义市场经济并推行经济体制改革。中国大陆近年至2010年,GDP超过72000亿美元, 已经成为美国之后的世界第二经济大国,普遍认为中国是世界上发展速度最快的经济体,但是人均国民生产总值仍位于世界中等水平(第89位),并逐渐受到资源限制和贫富差距加 大的制约。中华人民共和国省份中,广东为GDP最高的第一强省,浙江为人均收入最高的第一富省。中国大陆、香港、澳门、台湾之间的经济联系在全球化的过程中日益紧密。\n"} ``` ### 数据ID化 本部分,我们使用 `create_pretraining_data.py` 脚本将前面得到的 `baike_sample.jsonl` 进行tokenize id化处理。 ``` optional arguments: -h, --help show this help message and exit --model_name MODEL_NAME What model to use. 必须设置,如:ernie-1.0-base-zh, 可以参考已有的模型名称 https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer} What type of tokenizer to use. 模型对应的tokenizer, 目前暂时只支持 ERNIE,BERT,GPT data input/output: --input_path INPUT_PATH Path to input JSON files. 必须设置,输入文件jsonl的目录 --output_prefix OUTPUT_PREFIX Output prefix to store output file. 必须设置,输出文件的名称。 假设名称为XXX,则会输出 XXX_ids.npy, XXX_idx.npz 两个文件。 npy文件,数据id化后的token ids; npz文件,数据句子、文章位置索引。 --data_format {JSON} Only support json format for now. One document per line. 不需要设置。目前默认处理jsonl数据格式 --json_key JSON_KEY For JSON format. 
Space separate listed of keys to extract from json 文本串json的key值。同前面trans_to_json.py的json_key,默认text为key --split_sentences Split documents into sentences. 是否需要将文章划分成句子。一般而言,GPT不需要,BERT/ERNIE模型需要 chinese words: --chinese Is corpus need words segmentation step for chinese words. 中文情形必须设置。处理的文本类型是否是中文。 --cn_whole_word_segment Is corpus need words segmentation step for chinese words WWM. 可选。是否需要WWM策略。一般而言,BERT/ERNIE模型需要,GPT不需要。 --cn_seg_func {lac,seg,jieba} Words segment function for chinese words. 默认jieba,jieba速度较快,lac模型更准确,计算量高。 --cn_splited Is chinese corpus is splited in to words. 分词后的文本,可选。设置此选项则,cn_seg_func不起作用。 例如分词后文本串 "中国 效仿 西方 发展 工业 的过 程" --cn_split_dimer CN_SPLIT_DIMER Split dimer between chinese words. 配合cn_splited使用,默认空格表示分词间隔。 common config: --append_eos Append an token to the end of a document. gpt模型专用,gpt设置此选项,表示doc结束。 --log_interval LOG_INTERVAL Interval between progress updates 打印日志间隔,interval表示处理 文本行数/doc数的 间隔。 --workers WORKERS Number of worker processes to launch 处理文本id化的进程个数。 ``` 通过下面脚本转化,我们可以得到处理好的预训练数据,token ids:`baike_sample_ids.npy`, 文章索引信息`baike_sample_idx.npz`. ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ --model_name ernie-1.0-base-zh \ --tokenizer_name ErnieTokenizer \ --input_path preprocess/baike_sample.jsonl \ --split_sentences\ --chinese \ --cn_whole_word_segment \ --output_prefix preprocess/baike_sample \ --workers 1 \ --log_interval 5 ``` 1. 如果您使用已经分好词的语料,可以设置 --cn_splited 为 True,同时指定--cn_split_dimer如空格。 2. 使用自定义词表的话,请指定model_name为词表所在的文件夹地址。 ### ERNIE 预训练开始 得到了处理好的训练数据,拷贝到data目录,即可开始ERNIE模型预训练。 ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 mkdir data mv ./preprocess/baike_sample* ./data sh ./projects/ernie/pretrain_ernie_base.sh # 建议修改 pretrain_ernie_base.sh 中的配置,将max_steps设置小一些。 ``` 代码说明: - ernie预训练使用的 dataset 代码文件在 `ernie_dataset.py` - 数据集index生成,动态mask相关代码实现在`dataset_utils.py` 用户可以根据自己的需求,灵活修改mask方式。具体可以参考`dataset_utils.py`中`create_masked_lm_predictions`函数。 可以自定义的选项有do_whole_word_mask, favor_longer_ngram, do_permutation, geometric_dist等, 可以参考[Megatron](https://github.com/NVIDIA/Megatron-LM)使用这些lm_mask策略。 ### FAQ #### C++代码编译失败怎么办? - 请先检查pybind11包是否安装,g++、make工具是否正常。 - 编译失败可能是本文件夹下的Makefile命令出现了一些问题。可以将Makefile中的python3、python3-config设置成完全的路径,如/usr/bin/python3.7。 ## 参考内容 注: 大部分数据流程,参考自[Megatron](https://github.com/NVIDIA/Megatron-LM),特此表达感谢。 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import io import re import argparse import json import multiprocessing import sys import time import numpy as np from tqdm import tqdm import paddlenlp.transformers as tfs try: import nltk nltk_available = True except ImportError: nltk_available = False def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--model_name', type=str, required=True, help='What model to use.') parser.add_argument( '--tokenizer_name', type=str, required=True, choices=[ 'ErnieTokenizer', 'BertTokenizer', 'GPTTokenizer', 'GPTChineseTokenizer', 'ElectraTokenizer' ], help='What type of tokenizer to use.') group = parser.add_argument_group(title='data input/output') group.add_argument( '--input_path', type=str, required=True, help='Path to input JSON files.') group.add_argument( '--output_prefix', type=str, required=True, help='Output prefix to store output file.') group.add_argument( '--data_format', type=str, default='text', choices=['JSON'], help='Only support json format for now. One document per line.') group.add_argument( '--json_key', type=str, default='text', help='For JSON format. Space separate listed of keys to extract from json' ) group.add_argument( '--split_sentences', action='store_true', help='Split documents into sentences.') group = parser.add_argument_group(title='chinese words') group.add_argument( '--chinese', action='store_true', help="Is corpus need words segmentation step for chinese words.") group.add_argument( '--cn_whole_word_segment', action='store_true', help="Is corpus need words segmentation step for chinese words WWM.") group.add_argument( '--cn_seg_func', type=str, default='jieba', choices=['lac', 'seg', 'jieba'], help='Words segment function for chinese words.') group.add_argument( '--cn_splited', action='store_true', help="Is chinese corpus is splited in to words.") group.add_argument( '--cn_split_dimer', type=str, default=' ', help="Split dimer between chinese words.") group = parser.add_argument_group(title='common config') group.add_argument( '--append_eos', action='store_true', help='Append an token to the end of a document.') group.add_argument( '--log_interval', type=int, default=100, help='Interval between progress updates') group.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') args = parser.parse_args() return args def lexical_analysis_fn(): from LAC import LAC lac = LAC(mode="lac") def process(line): words, _ = lac.run(line) return words return process def chinese_segmentation_fn(): from LAC import LAC lac_cws = LAC(mode='seg') def process(line): words = lac_cws.run(line) return words return process def jieba_segmentation_fn(): import jieba def process(line): words = jieba.cut(line) return list(words) return process CHINESE_SEG_FUNC = { 'lac': lexical_analysis_fn(), 'seg': chinese_segmentation_fn(), 'jieba': jieba_segmentation_fn(), } def get_whole_word_mask_tokens(tokens, words, max_word_length=6): """ Do whole word mask on Chinese word. First, we do Chinese word segmentation on the sequence of tokens, which are from the WordPiece tokenization. 
Then, we add the '##' mark on chinese characters which are in the middle of Chinese words. And if the tokens are not chinese characters, we just exploit the results of WordPiece tokenization as words. Such as, - text line : 通过利用mercer核,将样本从输入空间映射到高维特征空间,使原来没有显现的特征突现出来,取得了很好的图像分割效果。 - the input tokens (after WordPiece): ['通', '过', '利', '用', 'me', '##rc', '##er', '核', ',', '将', '样', '本', '从', '输', '入', '空', '间', '映', '射', '到', '高', '维', '特', '征', '空', '间', ',', '使', '原', '来', '没', '有', '显', '现', '的', '特', '征', '突', '现', '出', '来', ',', '取', '得', '了', '很', '好', '的', '图', '像', '分', '割', '效', '果', '。'] - the Chinese words (after Chinese word segmentation like jieba) ['通过', '利用', 'mercer', '核', ',', '将', '样本', '从', '输入', '空间', '映射', '到', '高维', '特征', '空间', ',', '使', '原来', '没有', '显现', '的', '特征', '突现', '出来', ',', '取得', '了', '很', '好', '的', '图像', '分割', '效果', '。'] - the output whole word mask tokens: ['通', '##过', '利', '##用', 'me', '##rc', '##er', '核', ',', '将', '样', '##本', '从', '输', '##入', '空', '##间', '映', '##射', '到', '高', '##维', '特', '##征', '空', '##间', ',', '使', '原', '##来', '没', '##有', '显', '##现', '的', '特', '##征', '突', '##现', '出', '##来', ',', '取', '##得', '了', '很', '好', '的', '图', '##像', '分', '##割', '效', '##果', '。'] Args: tokens(list(str)): The sequence of tokens, which are from the WordPiece tokenization. words(list(str)): The sequence of Chinese words. max_word_length(int, optional): The maximum chinese character in Chinese words. It avoids too long Chinese word to be masked. Defaults as 4. Returns: new_tokens(list(str)): The new token will be done with whole word masking strategy. """ new_tokens = [] # opt for long document words_set = set(words) i = 0 while i < len(tokens): # non-chinese character, then do word piece if len(re.findall('[\u4E00-\u9FA5]', tokens[i])) == 0: new_tokens.append(tokens[i]) i += 1 continue # add "##" mark on the middel tokens of Chinese words # such as ["通过", "利用"] -> ["通", "##过", "利", "##用"] has_add = False for length in range(max_word_length, 0, -1): if i + length > len(tokens): continue if ''.join(tokens[i:i + length]) in words_set: new_tokens.append(tokens[i]) for l in range(1, length): new_tokens.append('##' + tokens[i + l]) i += length has_add = True break if not has_add: new_tokens.append(tokens[i]) i += 1 return new_tokens class IdentitySplitter(object): def tokenize(self, *text): return text class NewlineSplitter(): def tokenize(self, text): return text.split("\n") class Converter(object): def __init__(self, args): self.args = args def initializer(self): Converter.tokenizer = getattr( tfs, self.args.tokenizer_name).from_pretrained(self.args.model_name) if self.args.cn_whole_word_segment: # Extend chinese char vocab for ErnieTokinzer Converter.tokenizer.extend_chinese_char() # Split document to sentence. 
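        # Splitter selection below: Chinese text is split on newlines (one sentence
        # per line after preprocessing), English text is split with the nltk punkt
        # sentence tokenizer, and IdentitySplitter keeps the whole document as one
        # piece when --split_sentences is not set.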
if self.args.split_sentences: if self.args.chinese: Converter.splitter = NewlineSplitter() else: if not nltk_available: print("NLTK is not available to split sentences.") exit() splitter = nltk.load("tokenizers/punkt/english.pickle") Converter.splitter = splitter else: Converter.splitter = IdentitySplitter() # Split sentence whole words mask for chinese if self.args.cn_whole_word_segment: if self.args.cn_splited: Converter.segment_func = lambda text: text.split(self.args.cn_split_dimer) else: Converter.segment_func = CHINESE_SEG_FUNC[ self.args.cn_seg_func] Converter.whole_word_mask = get_whole_word_mask_tokens else: Converter.segment_func = lambda x: x Converter.whole_word_mask = lambda x, y: x def process(text): words = Converter.segment_func(text) # if there are two empty word, the should a split dimer in the pos if self.args.cn_splited: pre_dimer = False for index, w in enumerate(words): if pre_dimer and len(w) == 0: words[index] = self.args.cn_split_dimer pre_dimer = False elif len(w) == 0: pre_dimer = True else: pre_dimer = False tokens = Converter.tokenizer.tokenize("".join(words)) tokens = Converter.whole_word_mask(tokens, words) tokens = Converter.tokenizer.convert_tokens_to_ids(tokens) return tokens Converter.process = process def encode(self, json_line): text = json.loads(json_line)[self.args.json_key] doc_ids = [] for sentence in Converter.splitter.tokenize(text): sentence_ids = Converter.process(sentence.strip()) if len(sentence_ids) > 0: doc_ids.append(sentence_ids) if len(doc_ids) > 0 and self.args.append_eos: doc_ids[-1].append(Converter.tokenizer.eos_token_id) return doc_ids, len(text.encode("utf-8")) def main(): args = get_args() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) convert = Converter(args) # Try tokenizer is availiable sample_tokenizer = getattr( tfs, args.tokenizer_name).from_pretrained(args.model_name) if sample_tokenizer.vocab_size < 2**16 - 1: save_dtype = np.uint16 else: save_dtype = np.int32 pool = multiprocessing.Pool(args.workers, initializer=convert.initializer) # We use BytesIO to store the ids. 
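    # Three in-memory streams back the final output files:
    #   token_ids_stream  -> <output_prefix>_ids.npy: token ids of all sentences,
    #                        concatenated (uint16 or int32 depending on vocab size)
    #   sentlens_stream   -> "lens" in <output_prefix>_idx.npz: token count of each sentence (int32)
    #   doc_cumsum_stream -> "docs" in <output_prefix>_idx.npz: cumulative sentence count per
    #                        document, starting at 0, so document i covers sentences [docs[i], docs[i+1])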
token_ids_stream = io.BytesIO() sentlens_stream = io.BytesIO() # # Cumsum on tokens num # sent_cumsum_stream = io.BytesIO() # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) # Cunsum on document on every sentence num, type=np.int64 doc_cumsum_stream = io.BytesIO() doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) sent_count = 0 # token_count = 0 file_paths.sort() step = 0 total_bytes_processed = 0 startup_start = time.time() for file_path in tqdm(file_paths): if file_path.endswith(".zst"): import zstandard cctx = zstandard.ZstdDecompressor() fh = open(file_path, 'rb') text = io.BufferedReader(cctx.stream_reader(fh)) elif file_path.endswith(".jsonl"): text = open(file_path, 'r', encoding='utf-8') else: print("Unexpected data format, skiped %s" % file_path) continue encoded_docs = pool.imap(convert.encode, text, 256) print("Processing %s" % file_path) for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): step += 1 total_bytes_processed += bytes_processed if len(doc) == 0: continue for sentence in doc: sentence_len = len(sentence) if sentence_len == 0: continue sentlens_stream.write( sentence_len.to_bytes( 4, byteorder='little', signed=True)) # token_count += sentence_len # sent_cumsum_stream.write( # token_count.to_bytes( # 8, byteorder='little', signed=True)) sent_count += 1 token_ids_stream.write( np.array( sentence, dtype=save_dtype).tobytes(order='C')) doc_cumsum_stream.write( sent_count.to_bytes( 8, byteorder='little', signed=True)) if step % args.log_interval == 0: current = time.time() elapsed = current - startup_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {step} documents", f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).", file=sys.stderr) pool.close() print("Saving tokens to files...") all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype) lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32) # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64) docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64) np.save(args.output_prefix + "_ids.npy", all_doc_ids) # np.savez(args.output_prefix + "_idx.npz", lens=lens, sents=sents, docs=docs) np.savez(args.output_prefix + "_idx.npz", lens=lens, docs=docs) print("Total sentences num: %d" % len(lens)) print("Total documents num: %d" % (len(docs) - 1)) print("Total tokens num: %d" % len(all_doc_ids)) print("Average tokens per sentence: %.2f" % (len(all_doc_ids) / len(lens))) print("Average tokens per document: %.2f" % (len(all_doc_ids) / (len(docs) - 1))) if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpus2020.md ================================================ ## CLUECorpus2020 语料 | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | CLUECorpus2020| 中文 | 200GB | CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本,详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD),用户可以通过邮件申请下载,方式如下: > 数据下载 > 申请方式: 将使用语料研究目的和用途,计划、研究机构和申请者介绍,发送到邮箱,并承诺不向第三方提供。 > > 邮箱: CLUEbenchmark@163.com,标题是:CLUECorpus2020 200G语料库 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpusSmall.md ================================================ # CLUECorpusSmall | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | CLUECorpusSmall| 中文 | 14GB | **数据集简介**:可用于语言建模、预训练或生成型任务等,数据量超过14G,近4000个定义良好的txt文件、50亿个字。主要部分来自于nlp_chinese_corpus项目 
包含如下子语料库(总共14G语料):新闻语料[news2016zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/6bac09db4e6d4857b6d680d34447457490cb2dbdd8b8462ea1780a407f38e12b?responseContentDisposition=attachment%3B%20filename%3Dnews2016zh_corpus.zip), 社区互动语料[webText2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/83da03f7b4974871a52348b41c16c7e3b34a26d5ca644f558df8435be4de51c3?responseContentDisposition=attachment%3B%20filename%3DwebText2019zh_corpus.zip),维基百科语料[wiki2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/d7a166408d8b4ffdaf4de9cfca09f6ee1e2340260f26440a92f78134d068b28f?responseContentDisposition=attachment%3B%20filename%3Dwiki2019zh_corpus.zip),评论数据语料[comment2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/b66ddd445735408383c42322850ac4bb82faf9cc611447c2affb925443de7a6d?responseContentDisposition=attachment%3B%20filename%3Dcomment2019zh_corpus.zip)。 ## 数据获取 用户可以通过官方github网页下载,https://github.com/CLUEbenchmark/CLUECorpus2020 。同时,为方便用户,我们也提供了aistudio数据集下载地址。[part1](https://aistudio.baidu.com/aistudio/datasetdetail/60598),[part2](https://aistudio.baidu.com/aistudio/datasetdetail/124357)。使用aistudio版本的数据,下载好后,可以核对md5值: ```shell > md5sum ./* 8a8be341ebce39cfe9524fb0b46b08c5 ./comment2019zh_corpus.zip 4bdc2c941a7adb4a061caf273fea42b8 ./news2016zh_corpus.zip fc582409f078b10d717caf233cc58ddd ./webText2019zh_corpus.zip 157dacde91dcbd2e52a60af49f710fa5 ./wiki2019zh_corpus.zip ``` 解压文件 ```shell unzip comment2019zh_corpus.zip -d clue_corpus_small_14g/comment2019zh_corpus unzip news2016zh_corpus.zip -d clue_corpus_small_14g/news2016zh_corpus unzip webText2019zh_corpus.zip -d clue_corpus_small_14g/webText2019zh_corpus unzip wiki2019zh_corpus.zip -d clue_corpus_small_14g/wiki2019zh_corpus ``` 将txt文件转换为jsonl格式 ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl ``` 现在我们得到了jsonl格式的数据集。 ## ERNIE 中文预训练数据制作 下面是针对训练任务的数据集应用,此处以ernie为例。 ``` python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ --model_name ernie-1.0-base-zh \ --tokenizer_name ErnieTokenizer \ --input_path clue_corpus_small_14g.jsonl \ --split_sentences \ --chinese \ --cn_whole_word_segment \ --cn_seg_func jieba \ --output_prefix clue_corpus_small_14g_20220104 \ --workers 48 \ --log_interval 10000 ``` - model_name 可以更换为其他 ERNIE 系列模型,如: `ernie-3.0-base-zh` - workers 表示转化的线程数目 数据共有文档`15702702`条左右,由于分词比较耗时,大概一小时左右可以完成。在当前目录下产出训练所需数据。 ``` clue_corpus_small_14g_20220104_ids.npy clue_corpus_small_14g_20220104_idx.npz ``` 用户可以使用此数据进行预训练任务。 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/OpenWebText2.md ================================================ # OpenWebText2 | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | OpenWebText2 | 英文 | 70GB | ## 数据获取 [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/)是一个开源的英文网页文本数据集,数据来源于Reddit,经过去重、清洗、提取,最终包含800多万个文档。 本示例采用EleutherAI清洗好的[OpenWebText2数据](https://openwebtext2.readthedocs.io/en/latest/index.html#download-plug-and-play-version) 下载以后通过以下命令解压: ```shell wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar tar -xvf openwebtext2.json.zst.tar -C /path/to/openwebtext ``` ## GPT训练数据制作 然后使用[proprecess]](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/ernie/preprocess) 工具下的`create_pretraining_data.py`脚本进行数据集制作: ``` python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ 
--model_name gpt2-en \ --tokenizer_name GPTTokenizer \ --data_format JSON \ --input_path /path/to/openwebtext/ \ --append_eos \ --output_prefix gpt_openwebtext \ --workers 40 \ --log_interval 10000 ``` 处理时间约一个小时左右,就可以得到我们需要的`gpt_openwebtext_ids.npy`, `gpt_openwebtext_idx.npz`数据集文件。 为了方便用户运行测试本模型,本项目提供了处理好的300M的训练样本: ```shell wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ``` 将所有预处理得到的文件统一放入一个文件夹中,以备训练使用: ``` mkdir data mv gpt_en_dataset_300m_ids.npy ./data mv gpt_en_dataset_300m_idx.npz ./data ``` ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/WuDaoCorpusBase.md ================================================ # WuDaoCorpus2.0 Base 语料 | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | WuDaoCorpus2.0 Base| 中文 | 200GB | WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB,目前开源的部分为WuDaoCorpus2.0 bases数据集,大小为200GB。 ## 数据获取 **1. 下载解压** 用户微信登录[官网](https://resource.wudaoai.cn/home),即可直接下载数据。下载好的压缩数据约 64GB。解压 ``` unrar x WuDaoCorpus2.0_base_200G.rar ``` **2. 语料分词** 由于WuDao数据集比较大,分词比较耗时,这里先进行了语料分词: ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/words_segmentation.py \ --input_path ./WuDaoCorpus2.0_base_200G \ --workers 40 \ --data_format wudao \ --cn_seg_func seg \ --output_path ./wudao_lac_cut \ ``` 注:预训练需要实现 SOP( Sentence Order Predict) 任务,在分词的同时,我们使用 简单规则 进行了文本断句。如果语料只有一句话,建议去除SOP loss,训练时设置 `binary_head=False`。 **3. 转换为jsonl格式** 文本转化完成后。我们使用 `ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py`重新转换为jsonl格式(分词完毕)。 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py \ --input_path ./wudao_lac_cut \ --output_path wudao_corpus_200g_0623.jsonl \ --workers 40 ``` 在当前目录下产出数据`wudao_corpus_200g_0623.jsonl`。格式如下: ``` {"text": "主持人 : 作为 一个 曲线救国 的 路线 我们 没 办法 。\n金鑫 : 考试 和 分数 只是 一个 阶段性 的 评价 手段 , 不是 目的 , 就 像 人 活着 的 目的 不是 为了 吃饭 , 吃饭 是 为了 让 我们 活下去 , 我们 学习 的 目的 不是 为了 考试 , 不是 为了 那个 分数 , 而是 我 掌握 了 知识 , 成为 我 内在 的 能力 , 将来 我 去 创作 创造 工作 , 我能 把 它 做 得 更好 。\n主持人 : 特别感谢 金总 今天 接受 我 的 访谈 , 也 让 我 从 别的 层面 看到 了 一对一 到底 存在 的 道理 是 什么 , 并且 能 发展 那么 好 的 原因 在 哪里 。\n在 节目 后 您 谈谈 您 对 一对一 未来 的 希望 , 包括 您 对 它 未来 的 设想 是 什么 ?\n金鑫 : 一对一 个性化 教育 现在 还是 在 初级阶段 , 如果 是 四个 阶段 的话 , 现在 还是 在 第一阶段 到 第二阶段 迈进 的 , 学大 在 这方面 我们 希望 能 做 得 更 快 更 远 一些 。\n将来 个性化 教育 一定 是 能够 帮助 学生 在 成绩 上 的 提升 , 能够 更好 的 成长 , 进而 成为 对 社会 对 国家 更 有用 的 人才 , 就是 我们 的 成绩 、 成长 、 成才 。\n学大 1 对 1 教育 的 教师 团队 由 各科 优秀教师 、 考试 指导 专家 、 心理 辅导 专家 及 学习 方法 指导 专家 组成 , 同时 配备 专职 班主任 及 学习 监管 师 , 全方位 辅导 顺利 而 有序 的 运作 。\n其中 部分 教师 担任 多年 毕业班 教学 工作 , 多次 参与 中 考试 命题 研究 及 阅卷 工作 , 深谙 中 考试 精髓 , 能够 在 短 的 时间 内 引领 学生 掌握 中 考试 知识 重点 , 快速 提分 。\n■ 对于 成绩 差 的 学生 : 注重 学生 基础知识 , 力求 让 学生 在 基础 中 找 自信 , 在 自信 中 提升 ;\n注重 主观题 的 解题 方法 及 思路 , 以此 来 加强 对 基础知识 的 运用 。\n■ 对于 成绩 需要 拔高 的 学生 : 找出 学生 弱点 , 加强 基础 , 重点 提高 弱势 项目 。\n"} {"text": "武田信玄 是 天生 的 武将 , 一生 开拓 了 八十五万 石至 九十余万 石之多 的 领地 。\n武田信玄 他 21 岁 时 流放 自己 的 父亲 武田信虎 至骏河 , 避免 父亲 传位 给 弟弟 , 从而 登上 了 第 19 代家督 之位 。\n他 将 信 浓国 ( 现 长野县 ) 纳入 控制 范围 后 , 又 与 当时 的 豪强 今井氏 、 北条 氏 结成 三国 军事同盟 , 与 上 杉谦信 在 川 中岛 前后 展开 了 五次 大战 。\n武田信玄 勇于 进攻 。\n他 连续 攻打 邻国 , 扩大 自己 势力范围 , 可称 遇神 杀神 , 遇佛 杀佛 。\n他 不仅 流放 了 自己 的 父亲 , 连 自己 的 嫡子 武田义信 因 与 他 在 战略 方向 上 相左 , 也 被 他 幽禁 于 佛寺 , 随即 被迫 自杀 。\n武田信玄 虽然 是 战国 武将 中 的 最强者 , 但 他 的 弱点 是 年龄 。\n信玄比 织田信长 年长 13 岁 , 比上 杉谦信 年长 9 岁 。\n当信 玄年 届 五十 之 时 , 信长 和 谦信 犹 在 壮年 。\n上杉谦信 而且 , 武田信玄 虽 驰骋 天下 , 却 未率 军 进过 京都 , 而 织田信长 在 永禄 十一年 ( 1568 年 ) 就 以 拥立 第 15 代 将军 足利义 昭 为名 率兵 上洛 了 。\n所谓 \" 制 京都 者 得 天下 \" , 所以 , 想要 一统天下 , 武田信玄 
的 时间 很 紧迫 。\n元龟 三年 ( 1572 年 ) , 武田信玄 与 室 町 幕府 第 15 代 将军 足利义 昭 、 本愿 寺 显如 , 以及 浅井 氏 、 朝仓氏 等 反 织田信长 实力 组成 联盟 , 编织 \" 反信长 包围圈 \" 。\n同年 10 月 3 日 , 武田信玄 率领 大军 , 开始 了 第一次 上洛之行 。\n是 年 , 信玄 52 岁 , 这 也许 是 他 统一天下 的 最后 一次 机会 。\n武田信玄 所 率领 的 是 当时 战国 最强 的 3 万甲州 精兵 。\n打着 \" 风林火山 \" 的 旗帜 , 武田军 第一站 就 到达 了 织田信长 的 同盟 德川家康 所在 的 三河 远江 。\n织田信长 德川家康 的 军队 在 甲州 精兵 之前 显得 不堪一击 , 到 了 10 月 13 日 , 只来 成 、 天 方城 、 一 宫城 、 饭田 城 、 各和城 、 向 笠 城 等 城池 纷纷 被 攻陷 。\n德川家康 见势不妙 , 决定 在 浜松 城中 闭门不出 。\n但是 武田信玄 毫不 松懈 , 又 将 家康 在 远江 地区 的 重要 据点 二俣城 攻破 。\n德川家康 集合 所有 军队 共 1 万 1 千人 , 出城 与 信玄 决一死战 , 但 大败 而 还 , 险些 失 了 性命 。\n这次 战争 被 称为 \" 三方 原战 \" , 德川家康 曾经 承认 这次 战争 是 他 生平 最大 的 失败 。\n"} ``` ## ERNIE 中文预训练数据制作 下面是针对训练任务的数据集应用,此处以ernie为例。 ``` python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ --model_name ernie-1.0-base-zh \ --tokenizer_name ErnieTokenizer \ --input_path wudao_corpus_200g_0623.jsonl \ --split_sentences \ --chinese \ --cn_whole_word_segment \ --cn_seg_func jieba \ --cn_splited \ --output_prefix wudao_corpus_200g_0623 \ --workers 48 \ --log_interval 10000 ``` - 我们提前分词好了,所以加上了 `cn_splited`,否则不需要使用此选项。 - model_name 可以更换为其他 ERNIE 系列模型,如: `ernie-3.0-base-zh` - workers 表示转化的线程数目 在当前目录下产出训练所需数据。 ``` wudao_corpus_200g_0623_ids.npy wudao_corpus_200g_0623_idx.npz ``` 用户可以使用此数据进行预训练任务。 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import re import argparse import json import multiprocessing import sys import time import shutil from functools import partial import numpy as np from tqdm import tqdm def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--input_path', type=str, required=True, help='Path to you raw files. Folder or file path.') parser.add_argument( '--output_path', type=str, required=True, help='Path to save the output json files.') parser.add_argument( '--json_key', type=str, default='text', help='The content key of json file.') parser.add_argument( '--doc_spliter', type=str, default='', help="Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank." 
) parser.add_argument( '--min_doc_length', type=int, default=10, help="Minimal char of a documment.") parser.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') parser.add_argument( '--log_interval', type=int, default=1, help='Interval between progress updates.') parser.add_argument( '--no-merge', action='store_true', help='Don\'t merge the file.') parser.add_argument( '--no-shuffle', action='store_true', help='Don\'t shuffle the file.') args = parser.parse_args() return args def raw_text_to_json(path, doc_spliter="", json_key="text", min_doc_length=10): path = os.path.abspath(path) if not os.path.exists(path): print("No found file %s" % path) return 0, None out_filepath = path + ".jsonl" fout = open(out_filepath, "w", encoding="utf-8") len_files = 0 with open(path, "r") as f: doc = "" line = f.readline() while line: len_files += len(line) if line.strip() == doc_spliter: if len(doc) > min_doc_length: fout.write( json.dumps( { json_key: doc }, ensure_ascii=False) + "\n") doc = "" else: doc += line line = f.readline() if len(doc) > min_doc_length: fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n") doc = "" return len_files, out_filepath def merge_file(file_paths, output_path): if not output_path.endswith(".jsonl"): output_path = output_path + ".jsonl" print("Merging files into %s" % output_path) with open(output_path, 'wb') as wfd: for f in file_paths: if f is not None and os.path.exists(f): with open(f, 'rb') as fd: shutil.copyfileobj(fd, wfd) os.remove(f) print("File save in %s" % output_path) return output_path def shuffle_file(output_path): print("Shuffling the jsonl file...") if os.path.exists(output_path): os.system("shuf %s -o %s" % (output_path, output_path)) print("File shuffled!!!") else: raise ValueError("File not found: %s" % output_path) def main(): args = get_args() startup_start = time.time() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) pool = multiprocessing.Pool(args.workers) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) trans_json = partial( raw_text_to_json, doc_spliter=args.doc_spliter, json_key=args.json_key, min_doc_length=args.min_doc_length) encoded_files = pool.imap(trans_json, file_paths, 1) out_paths = [] for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1): total_bytes_processed += bytes_processed out_paths.append(out_path) master_start = time.time() if i % args.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {i} files", f"({i/elapsed} files/s, {mbs} MB/s).", file=sys.stderr) if not args.no_merge: output_path = merge_file(out_paths, args.output_path) if not args.no_shuffle: shuffle_file(output_path) if __name__ == "__main__": main() #profile.run("main()", "testprof") ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/words_segmentation.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import re import argparse import multiprocessing import os import time import jieba import sys from functools import partial def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--input_path', type=str, required=True, help='Path to you raw files. Folder or file path.') parser.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') parser.add_argument( '--output_path', type=str, default="./tmp", help='Path to save the output json files.') parser.add_argument( '--data_format', type=str, default="jsonl", choices=["jsonl", "wudao"], help='Path to you raw files. Folder or file path.') parser.add_argument( '--cn_seg_func', type=str, default='jieba', choices=['lac', 'seg', 'jieba'], help='Words segment function for chinese words.') parser.add_argument( '--log_interval', type=int, default=1, help='Interval between progress updates.') args = parser.parse_args() return args def lexical_analysis_fn(): from LAC import LAC lac = LAC(mode="lac") def process(line): words, _ = lac.run(line) return words return process def chinese_segmentation_fn(): from LAC import LAC lac_cws = LAC(mode='seg') def process(line): words = lac_cws.run(line) return words return process def jieba_segmentation_fn(): import jieba def process(line): words = jieba.cut(line) return list(words) return process CHINESE_SEG_FUNC = { 'lac': lexical_analysis_fn(), 'seg': chinese_segmentation_fn(), 'jieba': jieba_segmentation_fn(), } def read_wudao(path): print("Loading %s" % path) with open(path, "r") as f: try: contents = json.load(f) except Exception as e: print("Failed to load %s" % path) raise StopIteration for js in contents: yield js["content"] def read_jsonl(path): print("Loading %s" % path) with open(path, "r") as f: line = f.readline() while line: contents = json.load(f) yield contents["text"] line = f.readline() READFILE_FUNC = { 'jsonl': read_jsonl, 'wudao': read_wudao, } special_chars = ['\n', '。', '?', '?', ' ', ';', ';', '!', '!'] split_chars = ['。', '?', '?', ';', ';', '!', '!'] def text_to_text(path, output_path, read_func, seg_func): out_name = os.path.join(output_path, path[-20:]) print("Write into %s" % out_name) if os.path.exists(out_name): print("File exists %s" % out_name) return 0, None seg_func = CHINESE_SEG_FUNC[seg_func] read_func = READFILE_FUNC[read_func] import time s = time.time() data_len = 0 count = 0 with open(out_name, "w") as f: for text in read_func(path): # for js in contents: count += 1 # text = js["content"] data_len += len(text.encode("utf-8")) # make special char only once, # because of those token will be treat as sentence spliter. 
# 此处为断句逻辑 for char in special_chars: text = re.sub('[' + char + ']+[ ]*', char, text) for char in split_chars: text = text.replace(char, char + "\n") # 此处为分词逻辑 final = "" for line in text.split("\n"): if len(line) == 0: continue words = seg_func(line) final += " ".join(words) + "\n" f.write(final + "\n") return data_len, None def main(): args = get_args() startup_start = time.time() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) pool = multiprocessing.Pool(args.workers) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) if not os.path.exists(args.output_path): os.makedirs(args.output_path) trans_func = partial( text_to_text, output_path=args.output_path, seg_func=args.cn_seg_func, read_func=args.data_format) encoded_files = pool.imap(trans_func, file_paths, 1) out_paths = [] for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1): total_bytes_processed += bytes_processed out_paths.append(out_path) master_start = time.time() if i % args.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {i} files", f"({i/elapsed} files/s, {mbs} MB/s).", file=sys.stderr) pool.close() if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/data_tools/gpt/README.md ================================================ ## GPT 模型预训练数据准备流程(中文数据处理正在支持中) 我们将预训练数据过程划分为以下2个部分: 1. 原始数据转换,原始文本转换为jsonl的json字符串格式。 2. 数据ID化,断句、分词、tokenize转化为token id格式。 本目录下主要包含以下文件: ``` ├── preprocess_data.py # 将jsonl文本,断句、分词后,tokenizer转化为token id。 ├── README.md # 预训练数据准备流程教程 └── raw_trans_to_json.py # 原始文本数据转化的脚本,将数据转化为json串格式。 ``` ## 目录切换 ``` # 如果您还未下载 PaddleFleetX 套件,请先 clone 套件 # git clone https://github.com/PaddlePaddle/PaddleFleetX.git cd PaddleFleetX # 以下所有命令都在 PaddleFleetX 根目录中执行 ``` ## 环境依赖 - paddlepaddle-gpu>=2.3.0 - python==3.7 - tqdm==4.54.1 - numpy==1.20.1 - pybind11==2.10.0 安装命令`pip install -r requirements.txt`。 ## 训练全流程数据 Pipeline |步骤|阶段|数据格式| 样例| |-|-|-|-| | 原始数据清洗 | 原始数据准备|原始数据:
每个doc之间用空行间隔开
- 中文,默认每句换行符,作为句子结束。
- 英文,默认使用nltk判断句子结束。doc是由一段或多段文字组成,每段文字由一句或多句话组成。 | ```飞桨是功能完备、开源开放的产业级深度学习平台。```
```飞桨拥有核心训练和推理框架、基础模型库。```

```PaddleNLP是自然语言处理领域的优秀工具。``` | |原始数据转换
`raw_trans_to_json.py`|预处理|jsonl格式:每个doc对应一行json字符串| ```{"text": "飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有..."}```
```{"text": "PaddleNLP是自然语言..."}``` |数据ID化
`preprocess_data.py`|预处理| npy格式:数据id化后的token id
    npz格式:数据句子、文章位置索引 | - ## 全流程示例 下面以 GPT 预训练为例,简要介绍一下预训练数据处理的全流程。 ### 原始数据 首先下载样例数据: ``` mkdir -p dataset/wikitext_103_en wget -O dataset/wikitext_103_en/wikitext-103-en.txt http://fleet.bj.bcebos.com/datasets/gpt/wikitext-103-en.txt ``` ### 原始数据转换 jsonl 格式 使用`raw_trans_to_json.py`转化为json串格式,下面是脚本的使用说明 ``` optional arguments: -h, --help show this help message and exit --input_path INPUT_PATH Path to you raw files. Folder or file path. 必须设置,可以是文件夹或者单个文件。文件夹中的目录默认最多搜索两层子目录。 --output_path OUTPUT_PATH Path to save the output json files. 必须设置,输出文件的名字。 --json_key JSON_KEY The content key of json file. 建议不修改,默认的key是text --doc_spliter DOC_SPLITER Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank. 根据实际情况修改,默认空行作为文章换行符。 --min_doc_length MIN_DOC_LENGTH Minimal char of a documment. 可选。过滤掉长度多短的文章,默认值10 --workers WORKERS Number of worker processes to launch 可选。多进程转化文件,适用于 input_path 中包含的文件数据较多的情况。每个文件,分配给不同worker处理 --log_interval LOG_INTERVAL Interval between progress updates. 可选。此处的interval是值处理完文件个数的间隔。 --no-merge Don't merge the file. 可选。默认不开启这个选项,默认每个文件转换的jsonl文本,会拼接成到同一个文件。 --no-shuffle Don't shuffle the file. 可选。默认不开启这个选项,默认对处理完进行shuffle。 ``` 根据说明,我们使用下面简单命令,可以得到`wikitext_103_en.jsonl`文件。此处,我们对所有doc进行了shuffle。 ```shell python ppfleetx/data/data_tools/gpt/raw_trans_to_json.py --input_path ./dataset/wikitext_103_en --output_path ./dataset/wikitext_103_en/wikitext_103_en # output of terminal # Time to startup: 0.0075109004974365234 # Processed 1 files (0.12870440603278582 files/s, 64.80481421466284 MB/s). # Merging files into wikitext_103_en.jsonl # File save in wikitext_103_en.jsonl # Shuffling the jsonl file... # File shuffled!!! # 查看数据。因为对数据有 shuffle,下面的内容可能会不一样。 tail -1 ./dataset/wikitext_103_en/wikitext_103_en.jsonl {"text": "The album was released in June 1973 . Although it received good reviews , it did not sell well , except in Austin , where it sold more copies than earlier records by Nelson did nationwide . The recording led Nelson to a new style ; he later stated regarding his new musical identity that Shotgun Willie had \" cleared his throat . \" It became his breakthrough record , and one of the first of the outlaw movement , music created without the influence of the conservative Nashville Sound . The album — the first to feature Nelson with long hair and a beard on the cover — gained him the interest of younger audiences . It peaked at number 41 on Billboard 's album chart and the songs \" Shotgun Willie \" and \" Stay All Night ( Stay A Little Longer ) \" peaked at number 60 and 22 on Billboard Hot 100 respectively .\nRolling Stone wrote : \" With this flawless album , Willie Nelson finally demonstrates why he has for so long been regarded as a Country & Western singer @-@ songwriter 's singer @-@ songwriter ... At the age of 39 , Nelson finally seems destined for the stardom he deserves \" . Robert Christgau wrote : \" This attempt to turn Nelson into a star runs into trouble when it induces him to outshout Memphis horns or Western swing . \"\nBillboard wrote : \" This is Willie Nelson at his narrative best . He writes and sings with the love and the hurt and the down @-@ to @-@ earth things he feels , and he has a few peers . \" Texas Monthly praised Nelson and Wexler regarding the change in musical style : \" They 've switched his arrangements from Ray Price to Ray Charles — the result : a revitalized music . 
He 's the same old Willie , but veteran producer Jerry Wexler finally captured on wax the energy Nelson projects in person \" . School Library Journal wrote : \" Willie Nelson differs ( from ) rock artists framing their music with a country & western facade — in that he appears a honky @-@ tonk stardust cowboy to the core . This album abounds in unabashed sentimentalism , nasal singing , lyrics preoccupied with booze , religion , and love gone bad , and stereotyped Nashville instrumentation ( twangy steel guitars , fiddles , and a clean rhythm section characterized by the minimal use of bass drum and cymbals , both of which gain heavy mileage with rock performers ) .\nStephen Thomas Erlewine wrote in his review for Allmusic : \" Willie Nelson offered his finest record to date for his debut – possibly his finest album ever . Shotgun Willie encapsulates Willie 's world view and music , finding him at a peak as a composer , interpreter , and performer . This is laid @-@ back , deceptively complex music , equal parts country , rock attitude , jazz musicianship , and troubadour storytelling \" .\n"} ``` ### 数据ID化 我们使用 `preprocess_data.py` 脚本将前面得到的 `wikitext_103_en.jsonl` 进行tokenize id化处理。 ``` optional arguments: -h, --help show this help message and exit --model_name MODEL_NAME What model to use. 必须设置,如:gpt2 --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer} What type of tokenizer to use. 模型对应的tokenizer, 目前暂时只支持 Ernie,Bert,GPT data input/output: --input_path INPUT_PATH Path to input JSON files. 必须设置,输入文件jsonl的目录 --output_prefix OUTPUT_PREFIX Output prefix to store output file. 必须设置,输出文件的名称。 假设名称为XXX,则会输出 XXX_ids.npy, XXX_idx.npz 两个文件。 npy文件,数据id化后的token ids; npz文件,数据句子、文章位置索引。 --data_format {JSON} Only support json format for now. One document per line. 不需要设置。目前默认处理jsonl数据格式 --json_key JSON_KEY For JSON format. Space separate listed of keys to extract from json 文本串json的key值。同前面trans_to_json.py的json_key,默认text为key --split_sentences Split documents into sentences. 是否需要将文章划分成句子。一般而言,GPT不需要,Bert/Ernie模型需要 chinese words: --chinese Is corpus need words segmentation step for chinese words. 中文情形必须设置。处理的文本类型是否是中文。 --cn_whole_word_segment Is corpus need words segmentation step for chinese words WWM. 可选。是否需要WWM策略。一般而言,Bert/Ernie模型需要,GPT不需要。 --cn_seg_func {lac,seg,jieba} Words segment function for chinese words. 默认jieba,jieba速度较快,lac模型更准确,计算量高。 --cn_splited Is chinese corpus is splited in to words. 分词后的文本,可选。设置此选项则,cn_seg_func不起作用。 例如分词后文本串 "中国 效仿 西方 发展 工业 的过 程" --cn_split_dimer CN_SPLIT_DIMER Split dimer between chinese words. 配合cn_splited使用,默认空格表示分词间隔。 common config: --append_eos Append an token to the end of a document. gpt模型专用,gpt设置此选项,表示doc结束。 --log_interval LOG_INTERVAL Interval between progress updates 打印日志间隔,interval表示处理 文本行数/doc数的 间隔。 --workers WORKERS Number of worker processes to launch 处理文本id化的进程个数。 ``` 通过下面脚本转化,我们可以得到处理好的预训练数据,token ids:`wikitext_103_en.npy`, 文章索引信息`wikitext_103_en.npz`. 在使用 `GPTTokenizer` 时需要用到 `gpt2-vocab.json` 与 `gpt2-merges.txt`,如果没有下载缓存过这两个文件,脚本会自动下载并缓存。当遇到网络问题时,可以自行下载并将这两个文件放置在 `~/.cache/ppfleetx/` 目录下。 ``` python ppfleetx/data/data_tools/gpt/preprocess_data.py \ --model_name gpt2 \ --tokenizer_name GPTTokenizer \ --data_format JSON \ --input_path ./dataset/wikitext_103_en/wikitext_103_en.jsonl \ --append_eos \ --output_prefix ./dataset/wikitext_103_en/wikitext_103_en \ --workers 40 \ --log_interval 1000 # 处理完后 terminal 输出 # Processed 267000 documents (9843.34 docs/s, 18.4880 MB/s). 
# Processed 268000 documents (9869.46 docs/s, 18.5351 MB/s). # 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:27<00:00, 27.17s/it] # Saving tokens to files... # Total sentences num: 268492 # Total documents num: 268492 # Total tokens num: 114130026 # Average tokens per sentence: 425.08 # Average tokens per document: 425.08 ``` ## 参考内容 注: 大部分数据流程,参考自[Megatron](https://github.com/NVIDIA/Megatron-LM)和[PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP),特此表达感谢。 ================================================ FILE: ppfleetx/data/data_tools/gpt/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/gpt/preprocess_data.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import io import re import argparse import json import multiprocessing import sys import time import numpy as np from tqdm import tqdm try: from ppfleetx.data import tokenizers as tfs except ImportError: __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../'))) from ppfleetx.data import tokenizers as tfs from ppfleetx.utils.log import logger try: import nltk nltk_available = True except ImportError: nltk_available = False CHINESE_SEG_FUNC = {} def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--model_name', type=str, required=True, help='What model to use.') parser.add_argument( '--tokenizer_name', type=str, required=True, choices=[ 'ErnieTokenizer', 'BertTokenizer', 'GPTTokenizer', 'GPTChineseTokenizer', 'ElectraTokenizer' ], help='What type of tokenizer to use.') group = parser.add_argument_group(title='data input/output') group.add_argument( '--input_path', type=str, required=True, help='Path to input JSON files.') group.add_argument( '--output_prefix', type=str, required=True, help='Output prefix to store output file.') group.add_argument( '--data_format', type=str, default='text', choices=['JSON'], help='Only support json format for now. One document per line.') group.add_argument( '--json_key', type=str, default='text', help='For JSON format. 
Space separate listed of keys to extract from json' ) group.add_argument( '--split_sentences', action='store_true', help='Split documents into sentences.') group = parser.add_argument_group(title='chinese words') group.add_argument( '--chinese', action='store_true', help="Is corpus need words segmentation step for chinese words.") group.add_argument( '--cn_whole_word_segment', action='store_true', help="Is corpus need words segmentation step for chinese words WWM.") group.add_argument( '--cn_seg_func', type=str, default='jieba', choices=['lac', 'seg', 'jieba'], help='Words segment function for chinese words.') group.add_argument( '--cn_splited', action='store_true', help="Is chinese corpus is splited in to words.") group.add_argument( '--cn_split_dimer', type=str, default=' ', help="Split dimer between chinese words.") group = parser.add_argument_group(title='common config') group.add_argument( '--append_eos', action='store_true', help='Append an token to the end of a document.') group.add_argument( '--log_interval', type=int, default=100, help='Interval between progress updates') group.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') args = parser.parse_args() if args.chinese: global CHINESE_SEG_FUNC CHINESE_SEG_FUNC['lac'] = lexical_analysis_fn() CHINESE_SEG_FUNC['seg'] = chinese_segmentation_fn() CHINESE_SEG_FUNC['jieba'] = jieba_segmentation_fn() return args def lexical_analysis_fn(): from LAC import LAC lac = LAC(mode="lac") def process(line): words, _ = lac.run(line) return words return process def chinese_segmentation_fn(): from LAC import LAC lac_cws = LAC(mode='seg') def process(line): words = lac.run(line) return words return process def jieba_segmentation_fn(): import jieba def process(line): words = jieba.cut(line) return list(words) return process def get_whole_word_mask_tokens(tokens, words, max_word_length=4): """ Do whole word mask on Chinese word. First, we do Chinese word segmentation on the sequence of tokens, which are from the WordPiece tokenization. Then, we add the '##' mark on chinese characters which are in the middle of Chinese words. And if the tokens are not chinese characters, we just exploit the results of WordPiece tokenization as words. Such as, - text line : 通过利用mercer核,将样本从输入空间映射到高维特征空间,使原来没有显现的特征突现出来,取得了很好的图像分割效果。 - the input tokens (after WordPiece): ['通', '过', '利', '用', 'me', '##rc', '##er', '核', ',', '将', '样', '本', '从', '输', '入', '空', '间', '映', '射', '到', '高', '维', '特', '征', '空', '间', ',', '使', '原', '来', '没', '有', '显', '现', '的', '特', '征', '突', '现', '出', '来', ',', '取', '得', '了', '很', '好', '的', '图', '像', '分', '割', '效', '果', '。'] - the Chinese words (after Chinese word segmentation like jieba) ['通过', '利用', 'mercer', '核', ',', '将', '样本', '从', '输入', '空间', '映射', '到', '高维', '特征', '空间', ',', '使', '原来', '没有', '显现', '的', '特征', '突现', '出来', ',', '取得', '了', '很', '好', '的', '图像', '分割', '效果', '。'] - the output whole word mask tokens: ['通', '##过', '利', '##用', 'me', '##rc', '##er', '核', ',', '将', '样', '##本', '从', '输', '##入', '空', '##间', '映', '##射', '到', '高', '##维', '特', '##征', '空', '##间', ',', '使', '原', '##来', '没', '##有', '显', '##现', '的', '特', '##征', '突', '##现', '出', '##来', ',', '取', '##得', '了', '很', '好', '的', '图', '##像', '分', '##割', '效', '##果', '。'] Args: tokens(list(str)): The sequence of tokens, which are from the WordPiece tokenization. words(list(str)): The sequence of Chinese words. max_word_length(int, optional): The maximum chinese character in Chinese words. It avoids too long Chinese word to be masked. Defaults as 4. 
Returns: new_tokens(list(str)): The new token will be done with whole word masking strategy. """ new_tokens = [] # opt for long document words_set = set(words) i = 0 while i < len(tokens): # non-chinese character, then do word piece if len(re.findall('[\u4E00-\u9FA5]', tokens[i])) == 0: new_tokens.append(tokens[i]) i += 1 continue # add "##" mark on the middel tokens of Chinese words # such as ["通过", "利用"] -> ["通", "##过", "利", "##用"] has_add = False for length in range(max_word_length, 0, -1): if i + length > len(tokens): continue if ''.join(tokens[i:i + length]) in words_set: new_tokens.append(tokens[i]) for l in range(1, length): new_tokens.append('##' + tokens[i + l]) i += length has_add = True break if not has_add: new_tokens.append(tokens[i]) i += 1 return new_tokens class IdentitySplitter(object): def tokenize(self, *text): return text class NewlineSplitter(): def tokenize(self, text): return text.split("\n") class Converter(object): def __init__(self, args): self.args = args def initializer(self): Converter.tokenizer = getattr( tfs, self.args.tokenizer_name).from_pretrained(self.args.model_name) # Split document to sentence. if self.args.split_sentences: if self.args.chinese: Converter.splitter = NewlineSplitter() else: if not nltk_available: print("NLTK is not available to split sentences.") exit() splitter = nltk.load("tokenizers/punkt/english.pickle") Converter.splitter = splitter else: Converter.splitter = IdentitySplitter() # Split sentence whole words mask for chinese if self.args.cn_whole_word_segment: if self.args.cn_splited: Converter.segment_func = lambda text: text.split(self.args.cn_split_dimer) else: Converter.segment_func = CHINESE_SEG_FUNC[ self.args.cn_seg_func] Converter.whole_word_mask = get_whole_word_mask_tokens else: Converter.segment_func = lambda x: x Converter.whole_word_mask = lambda x, y: x def process(text): words = Converter.segment_func(text) tokens = Converter.tokenizer.tokenize("".join(words)) tokens = Converter.whole_word_mask(tokens, words) tokens = Converter.tokenizer.convert_tokens_to_ids(tokens) return tokens Converter.process = process def encode(self, json_line): text = json.loads(json_line)[self.args.json_key] doc_ids = [] for sentence in Converter.splitter.tokenize(text): sentence_ids = Converter.process(sentence.strip()) if len(sentence_ids) > 0: doc_ids.append(sentence_ids) if len(doc_ids) > 0 and self.args.append_eos: doc_ids[-1].append(Converter.tokenizer.eos_token_id) return doc_ids, len(text.encode("utf-8")) def main(): args = get_args() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) if len(file_paths) == 0: print("No input file found!") exit(-1) convert = Converter(args) # Try tokenizer is availiable sample_tokenizer = getattr( tfs, args.tokenizer_name).from_pretrained(args.model_name) if sample_tokenizer.vocab_size < 2**16 - 1: save_dtype = np.uint16 else: save_dtype = np.int32 pool = multiprocessing.Pool(args.workers, initializer=convert.initializer) # We use BytesIO to store the ids. 
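    # Stream layout used below:
    #   token_ids_stream  - all token ids concatenated (dtype `save_dtype`), saved to <output_prefix>_ids.npy
    #   sentlens_stream   - one int32 length per sentence, saved as `lens` in <output_prefix>_idx.npz
    #   doc_cumsum_stream - int64 cumulative sentence count per document (seeded with 0),
    #                       saved as `docs` in <output_prefix>_idx.npz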
token_ids_stream = io.BytesIO() sentlens_stream = io.BytesIO() # # Cumsum on tokens num # sent_cumsum_stream = io.BytesIO() # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) # Cunsum on document on every sentence num, type=np.int64 doc_cumsum_stream = io.BytesIO() doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) sent_count = 0 # token_count = 0 file_paths.sort() step = 0 total_bytes_processed = 0 startup_start = time.time() for file_path in tqdm(file_paths): if file_path.endswith(".zst"): import zstandard cctx = zstandard.ZstdDecompressor() fh = open(file_path, 'rb') text = io.BufferedReader(cctx.stream_reader(fh)) elif file_path.endswith(".jsonl"): text = open(file_path, 'r', encoding='utf-8') else: print("Unexpected data format, skiped %s" % file_path) continue encoded_docs = pool.imap(convert.encode, text, 256) print("Processing %s" % file_path) for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): step += 1 total_bytes_processed += bytes_processed if len(doc) == 0: continue for sentence in doc: sentence_len = len(sentence) if sentence_len == 0: continue sentlens_stream.write( sentence_len.to_bytes( 4, byteorder='little', signed=True)) # token_count += sentence_len # sent_cumsum_stream.write( # token_count.to_bytes( # 8, byteorder='little', signed=True)) sent_count += 1 token_ids_stream.write( np.array( sentence, dtype=save_dtype).tobytes(order='C')) doc_cumsum_stream.write( sent_count.to_bytes( 8, byteorder='little', signed=True)) if step % args.log_interval == 0: current = time.time() elapsed = current - startup_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {step} documents", f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).", file=sys.stderr) pool.close() print("Saving tokens to files...") all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype) lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32) # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64) docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64) np.save(args.output_prefix + "_ids.npy", all_doc_ids) # np.savez(args.output_prefix + "_idx.npz", lens=lens, sents=sents, docs=docs) np.savez(args.output_prefix + "_idx.npz", lens=lens, docs=docs) print("Total sentences num: %d" % len(lens)) print("Total documents num: %d" % (len(docs) - 1)) print("Total tokens num: %d" % len(all_doc_ids)) print("Average tokens per sentence: %.2f" % (len(all_doc_ids) / len(lens))) print("Average tokens per document: %.2f" % (len(all_doc_ids) / (len(docs) - 1))) if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/data_tools/gpt/raw_trans_to_json.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import re import argparse import json import multiprocessing import sys import time import shutil from functools import partial import numpy as np from tqdm import tqdm def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--input_path', type=str, required=True, help='Path to you raw files. Folder or file path.') parser.add_argument( '--output_path', type=str, required=True, help='Path to save the output json files.') parser.add_argument( '--json_key', type=str, default='text', help='The content key of json file.') parser.add_argument( '--doc_spliter', type=str, default='', help="Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank." ) parser.add_argument( '--min_doc_length', type=int, default=10, help="Minimal char of a documment.") parser.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') parser.add_argument( '--log_interval', type=int, default=1, help='Interval between progress updates.') parser.add_argument( '--no-merge', action='store_true', help='Don\'t merge the file.') parser.add_argument( '--no-shuffle', action='store_true', help='Don\'t shuffle the file.') args = parser.parse_args() return args def raw_text_to_json(path, doc_spliter="", json_key="text", min_doc_length=10): path = os.path.abspath(path) if not os.path.exists(path): print("No found file %s" % path) return 0, None out_filepath = path + ".jsonl" fout = open(out_filepath, "w", encoding="utf-8") len_files = 0 with open(path, "r") as f: doc = "" line = f.readline() while line: len_files += len(line) if line.strip() == doc_spliter: if len(doc) > min_doc_length: fout.write( json.dumps( { json_key: doc }, ensure_ascii=False) + "\n") doc = "" else: doc += line line = f.readline() if len(doc) > min_doc_length: fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n") doc = "" return len_files, out_filepath def merge_file(file_paths, output_path): if not output_path.endswith(".jsonl"): output_path = output_path + ".jsonl" print("Merging files into %s" % output_path) with open(output_path, 'wb') as wfd: for f in file_paths: if f is not None and os.path.exists(f): with open(f, 'rb') as fd: shutil.copyfileobj(fd, wfd) os.remove(f) print("File save in %s" % output_path) return output_path def shuffle_file(output_path): print("Shuffling the jsonl file...") if os.path.exists(output_path): os.system("shuf %s -o %s" % (output_path, output_path)) print("File shuffled!!!") else: raise ValueError("File not found: %s" % output_path) def main(): args = get_args() startup_start = time.time() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) pool = multiprocessing.Pool(args.workers) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) trans_json = partial( raw_text_to_json, doc_spliter=args.doc_spliter, json_key=args.json_key, min_doc_length=args.min_doc_length) encoded_files = pool.imap(trans_json, file_paths, 1) out_paths = [] for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1): total_bytes_processed += bytes_processed out_paths.append(out_path) master_start = time.time() if i % args.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {i} files", f"({i/elapsed} files/s, {mbs} MB/s).", 
file=sys.stderr) if not args.no_merge: output_path = merge_file(out_paths, args.output_path) if not args.no_shuffle: shuffle_file(output_path) if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/dataset/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .vision_dataset import ( GeneralClsDataset, ImageFolder, CIFAR10, ContrativeLearningDataset, ) from .multimodal_dataset import ImagenDataset from .gpt_dataset import GPTDataset, LM_Eval_Dataset, Lambada_Eval_Dataset from .glue_dataset import * from .ernie.ernie_dataset import ErnieDataset, ErnieSeqClsDataset ================================================ FILE: ppfleetx/data/dataset/ernie/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/dataset/ernie/dataset_utils.py ================================================ # coding=utf-8 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Google AI Language Team Authors, and NVIDIA. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Most of the code here has been copied from: # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. import math import os import re import time import collections import numpy as np import paddle def get_local_rank(): return int(os.getenv("PADDLE_RANK_IN_NODE", 0)) print_rank_0 = print # COMPILED = False # DSET_TYPE_BERT = 'standard_bert' # DSET_TYPE_T5 = 't5' # DSET_TYPE_ERNIE = 'ernie' # DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_T5, DSET_TYPE_ERNIE] def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples): # The data prefix should be in the format of: # weight-1, data-prefix-1, weight-2, data-prefix-2, .. 
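    # e.g. data_prefix = ["0.3", "corpus_a", "0.7", "corpus_b"]
    #   -> prefixes = ["corpus_a", "corpus_b"], weights normalized to [0.3, 0.7],
    #      and each per-split sample count is scaled by weight * 1.005 to leave a
    #      small margin when blending datasets.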
assert len(data_prefix) % 2 == 0 num_datasets = len(data_prefix) // 2 weights = [0] * num_datasets prefixes = [0] * num_datasets for i in range(num_datasets): weights[i] = float(data_prefix[2 * i]) prefixes[i] = (data_prefix[2 * i + 1]).strip() # Normalize weights weight_sum = 0.0 for weight in weights: weight_sum += weight assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] # Add 0.5% (the 1.005 factor) so in case the bleding dataset does # not uniformly distribute the number of samples, we still have # samples left to feed to the network. datasets_train_valid_test_num_samples = [] for weight in weights: datasets_train_valid_test_num_samples.append([ int(math.ceil(val * weight * 1.005)) for val in train_valid_test_num_samples ]) return prefixes, weights, datasets_train_valid_test_num_samples class MMapIndexedDataset(paddle.io.Dataset): def __init__(self, path, skip_warmup=False): super().__init__() self._path = path # All documment ids, extend as 1-D array. for suffix in ["_ids.npy", "_idx.npz"]: # print(path, suffix) if not os.path.isfile(path + suffix): raise ValueError("File Not found, %s" % (path + suffix)) self._token_ids = np.load( path + "_ids.npy", mmap_mode="r", allow_pickle=True) process_data = np.load(path + "_idx.npz") self._sizes = process_data["lens"] self._pointers = np.empty(len(self._sizes) + 1, dtype=np.int64) self._pointers[0] = 0 np.cumsum(self._sizes, out=self._pointers[1:]) self._doc_idx = process_data["docs"] def __getstate__(self): return self._path def __len__(self): return len(self._sizes) # @lru_cache(maxsize=8) def __getitem__(self, idx): if isinstance(idx, int): size = self._sizes[idx] ptr = self._pointers[idx] np_array = self._token_ids[ptr:ptr + size] return np_array elif isinstance(idx, slice): start, stop, step = idx.indices(len(self)) if step != 1: raise ValueError( "Slices into indexed_dataset must be contiguous") ptr = self._pointers[start] sizes = self._sizes[idx] offsets = list(accumulate(sizes)) total_size = sum(sizes) np_array = self._token_ids[ptr:ptr + total_size] sents = np.split(np_array, offsets[:-1]) return sents def get(self, idx, offset=0, length=None): """ Retrieves a single item from the dataset with the option to only return a portion of the item. get(idx) is the same as [idx] but get() does not support slicing. """ size = self._sizes[idx] ptr = self._pointers[idx] if length is None: length = size - offset ptr += offset np_array = self._token_ids[ptr:prt + length] return np_array @property def sizes(self): return self._sizes @property def doc_idx(self): return self._doc_idx def get_doc_idx(self): return self._doc_idx def set_doc_idx(self, doc_idx_): self._doc_idx = doc_idx_ def make_indexed_dataset(data_prefix, data_impl=None, skip_warmup=False): return MMapIndexedDataset(data_prefix) def get_a_and_b_segments(sample, np_rng): """Divide sample into a and b segments.""" # Number of sentences in the sample. n_sentences = len(sample) # Make sure we always have two sentences. assert n_sentences > 1, 'make sure each sample has at least two sentences.' # First part: # `a_end` is how many sentences go into the `A`. a_end = 1 if n_sentences >= 3: # Note that randin in numpy is exclusive. 
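        # a_end is drawn from [1, n_sentences - 1], so segment B always keeps
        # at least the last sentence.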
a_end = np_rng.randint(1, n_sentences) tokens_a = [] for j in range(a_end): tokens_a.extend(sample[j]) # Second part: tokens_b = [] for j in range(a_end, n_sentences): tokens_b.extend(sample[j]) # Random next: is_next_random = False if np_rng.random() < 0.5: is_next_random = True tokens_a, tokens_b = tokens_b, tokens_a return tokens_a, tokens_b, is_next_random def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): """Truncates a pair of sequences to a maximum sequence length.""" #print(len_a, len_b, max_num_tokens) assert len_a > 0 if len_a + len_b <= max_num_tokens: return False while len_a + len_b > max_num_tokens: if len_a > len_b: len_a -= 1 tokens = tokens_a else: len_b -= 1 tokens = tokens_b if np_rng.random() < 0.5: del tokens[0] else: tokens.pop() return True def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" tokens = [] tokentypes = [] # [CLS]. tokens.append(cls_id) tokentypes.append(0) # Segment A. for token in tokens_a: tokens.append(token) tokentypes.append(0) # [SEP]. tokens.append(sep_id) tokentypes.append(0) # Segment B. for token in tokens_b: tokens.append(token) tokentypes.append(1) if tokens_b: # [SEP]. tokens.append(sep_id) tokentypes.append(1) return tokens, tokentypes MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "label"]) def is_start_piece(piece): """Check if the current word piece is the starting piece (BERT).""" # When a word has been split into # WordPieces, the first token does not have any marker and any subsequence # tokens are prefixed with ##. So whenever we see the ## token, we # append it to the previous set of word indexes. return not piece.startswith("##") def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=3, vocab_token_to_id_dict=None, do_whole_word_mask=True, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, to_chinese_char=False, inplace_random_mask=False, masking_style="bert"): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: token_boundary[i] = 1 continue # Whole Word Masking means that if we mask all of the wordpieces # corresponding to an original word. # # Note that Whole Word Masking does *not* change the training code # at all -- we still predict each WordPiece independently, softmaxed # over the entire vocabulary. 
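        # Group a '##'-prefixed continuation piece with the preceding start piece,
        # so the whole word is masked (or kept) together later.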
vocab_id = vocab_id_to_token_dict[token] if (do_whole_word_mask and len(cand_indexes) >= 1 and not is_start_piece(vocab_id)): cand_indexes[-1].append(i) else: cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 if to_chinese_char: # set ## chinse char to original chinese char char_tokens = [] assert vocab_token_to_id_dict is not None for i, b in enumerate(token_boundary): if b == 0: vocab_id = vocab_id_to_token_dict[tokens[i]] new_vocab_id = vocab_id[2:] if len( re.findall('##[\u4E00-\u9FA5]', vocab_id)) > 0 else vocab_id char_tokens.append(vocab_token_to_id_dict[new_vocab_id] if new_vocab_id in vocab_token_to_id_dict else token) else: char_tokens.append(tokens[i]) output_tokens = list(char_tokens) else: output_tokens = list(tokens) masked_lm_positions = [] masked_lm_labels = [] if masked_lm_prob == 0: return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) # NOTE(shenliang03): to avoid num_to_predict < 1 num_to_predict = max(1, min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))) ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if not geometric_dist: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] ngram_indexes = [] for idx in range(len(cand_indexes)): ngram_index = [] for n in ngrams: ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() backup_output_tokens = list(output_tokens) for cand_index_set in ngram_indexes: if len(masked_lms) >= num_to_predict: break if not cand_index_set: continue # Note(mingdachen): # Skip current piece if they are covered in lm masking or previous ngrams. for index_set in cand_index_set[0]: for index in index_set: if index in covered_indexes: continue if not geometric_dist: n = np_rng.choice( ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) else: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) index_set = sum(cand_index_set[n - 1], []) n -= 1 # Note(mingdachen): # Repeatedly looking for a candidate that does not exceed the # maximum number of predictions by trying shorter ngrams. while len(masked_lms) + len(index_set) > num_to_predict: if n == 0: break index_set = sum(cand_index_set[n - 1], []) n -= 1 # If adding a whole-word mask would exceed the maximum number of # predictions, then just skip this candidate. 
if len(masked_lms) + len(index_set) > num_to_predict: continue is_any_index_covered = False for index in index_set: if index in covered_indexes: is_any_index_covered = True break if is_any_index_covered: continue for index in index_set: covered_indexes.add(index) masked_token = None if masking_style == "bert": # 80% of the time, replace with [MASK] if np_rng.random() < 0.8: masked_token = mask_id else: # 10% of the time, keep original if np_rng.random() < 0.5: masked_token = output_tokens[index] # 10% of the time, replace with random word else: if inplace_random_mask: masked_token = backup_output_tokens[np_rng.randint( 0, len(output_tokens))] else: masked_token = vocab_id_list[np_rng.randint( 0, len(vocab_id_list))] elif masking_style == "t5": masked_token = mask_id else: raise ValueError("invalid value of masking style") output_tokens[index] = masked_token masked_lms.append( MaskedLmInstance( index=index, label=backup_output_tokens[index])) masked_spans.append( MaskedLmInstance( index=index_set, label=[backup_output_tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict np_rng.shuffle(ngram_indexes) select_indexes = set() if do_permutation: for cand_index_set in ngram_indexes: if len(select_indexes) >= num_to_predict: break if not cand_index_set: continue # Note(mingdachen): # Skip current piece if they are covered in lm masking or previous ngrams. for index_set in cand_index_set[0]: for index in index_set: if index in covered_indexes or index in select_indexes: continue n = np.random.choice( ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) index_set = sum(cand_index_set[n - 1], []) n -= 1 while len(select_indexes) + len(index_set) > num_to_predict: if n == 0: break index_set = sum(cand_index_set[n - 1], []) n -= 1 # If adding a whole-word mask would exceed the maximum number of # predictions, then just skip this candidate. if len(select_indexes) + len(index_set) > num_to_predict: continue is_any_index_covered = False for index in index_set: if index in covered_indexes or index in select_indexes: is_any_index_covered = True break if is_any_index_covered: continue for index in index_set: select_indexes.add(index) assert len(select_indexes) <= num_to_predict select_indexes = sorted(select_indexes) permute_indexes = list(select_indexes) np_rng.shuffle(permute_indexes) orig_token = list(output_tokens) for src_i, tgt_i in zip(select_indexes, permute_indexes): output_tokens[src_i] = orig_token[tgt_i] masked_lms.append( MaskedLmInstance( index=src_i, label=orig_token[src_i])) masked_lms = sorted(masked_lms, key=lambda x: x.index) # Sort the spans by the index of the first span masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) for p in masked_lms: masked_lm_positions.append(p.index) masked_lm_labels.append(p.label) return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, masked_labels, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" # Some checks. num_tokens = len(tokens) padding_length = max_seq_length - num_tokens assert padding_length >= 0 assert len(tokentypes) == num_tokens assert len(masked_positions) == len(masked_labels) # Tokens and token types. filler = [pad_id] * padding_length tokens_np = np.array(tokens + filler, dtype=np.int64) tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) # Padding mask. 
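    # 1 marks a real token, 0 marks a padded position.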
padding_mask_np = np.array( [1] * num_tokens + [0] * padding_length, dtype=np.int64) # Lables and loss mask. labels = [-1] * max_seq_length loss_mask = [0] * max_seq_length for i in range(len(masked_positions)): assert masked_positions[i] < num_tokens labels[masked_positions[i]] = masked_labels[i] loss_mask[masked_positions[i]] = 1 labels_np = np.array(labels, dtype=np.int64) loss_mask_np = np.array(loss_mask, dtype=np.int64) return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): print_rank_0(' > building dataset index ...') start_time = time.time() indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] print_rank_0(' > finished creating indexed dataset in {:4f} ' 'seconds'.format(time.time() - start_time)) print_rank_0(' > indexed dataset stats:') print_rank_0(' number of documents: {}'.format( indexed_dataset.doc_idx.shape[0] - 1)) print_rank_0(' number of sentences: {}'.format( indexed_dataset.sizes.shape[0])) return indexed_dataset def get_train_valid_test_split_(splits_string, size): """ Get dataset splits from comma or '/' separated string list.""" splits = [] if splits_string.find(',') != -1: splits = [float(s) for s in splits_string.split(',')] elif splits_string.find('/') != -1: splits = [float(s) for s in splits_string.split('/')] else: splits = [float(splits_string)] while len(splits) < 3: splits.append(0.) splits = splits[:3] splits_sum = sum(splits) assert splits_sum > 0.0 splits = [split / splits_sum for split in splits] splits_index = [0] for index, split in enumerate(splits): splits_index.append(splits_index[index] + int( round(split * float(size)))) diff = splits_index[-1] - size for index in range(1, len(splits_index)): splits_index[index] -= diff assert len(splits_index) == 4 assert splits_index[-1] == size return splits_index def get_samples_mapping(indexed_dataset, data_prefix, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, name, binary_head, share_folder): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" if not num_epochs: if not max_num_samples: raise ValueError("Need to specify either max_num_samples " "or num_epochs") num_epochs = np.iinfo(np.int32).max - 1 if not max_num_samples: max_num_samples = np.iinfo(np.int64).max - 1 # Filename of the index mapping indexmap_filename = data_prefix indexmap_filename += '_{}_indexmap'.format(name) if num_epochs != (np.iinfo(np.int32).max - 1): indexmap_filename += '_{}ep'.format(num_epochs) if max_num_samples != (np.iinfo(np.int64).max - 1): indexmap_filename += '_{}mns'.format(max_num_samples) indexmap_filename += '_{}msl'.format(max_seq_length) indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) indexmap_filename += '_{}s'.format(seed) indexmap_filename += '.npy' local_rank = get_local_rank() if share_folder: local_rank = paddle.distributed.get_rank() # Build the indexed mapping if not exist. if local_rank == 0 and \ not os.path.isfile(indexmap_filename): print(' > WARNING: could not find index map file {}, building ' 'the indices on rank 0 ...'.format(indexmap_filename)) # Make sure the types match the helpers input types. 
assert indexed_dataset.doc_idx.dtype == np.int64 print(indexed_dataset.sizes.dtype) assert indexed_dataset.sizes.dtype == np.int32 try: import ppfleetx.data.data_tools.cpp.fast_index_map_helpers as ernie_fast_index_map_helpers except Exception as e: start_time = time.time() print('> compiling dataset index builder ...') from ppfleetx.data.data_tools.cpp.compile import compile_helper compile_helper() print( '>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) import ppfleetx.data.data_tools.cpp.fast_index_map_helpers as ernie_fast_index_map_helpers samples_mapping = ernie_fast_index_map_helpers.build_mapping( indexed_dataset.doc_idx, indexed_dataset.sizes, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, True, 2 if binary_head else 1) print_rank_0(' > done building sapmles index maping') start_time = time.time() np.save(indexmap_filename, samples_mapping, allow_pickle=True) print_rank_0(' > saved the index mapping in {}'.format( indexmap_filename)) # Make sure all the ranks have built the mapping print_rank_0(' > elasped time to build and save samples mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) else: while True: if (not os.path.isfile(indexmap_filename)): time.sleep(3) else: try: np.load( indexmap_filename, allow_pickle=True, mmap_mode='r') break except Exception as e: print( "%s file is still writing or damaged, please wait a moment." % indexmap_filename) time.sleep(3) # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case if paddle.distributed.get_world_size() > 1: if paddle.in_dynamic_mode(): paddle.distributed.barrier() # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( indexmap_filename)) start_time = time.time() samples_mapping = np.load( indexmap_filename, allow_pickle=True, mmap_mode='r') print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(time.time( ) - start_time)) print_rank_0(' total number of samples: {}'.format( samples_mapping.shape[0])) return samples_mapping ================================================ FILE: ppfleetx/data/dataset/ernie/ernie_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import sys import time import numpy as np import re import copy from functools import partial import paddle from .dataset_utils import ( get_samples_mapping, get_a_and_b_segments, truncate_segments, create_tokens_and_tokentypes, create_masked_lm_predictions, make_indexed_dataset, get_indexed_dataset_, ) from paddlenlp.transformers import ErnieTokenizer from paddlenlp.datasets.dataset import MapDataset, IterableDataset, SimpleBuilder, load_dataset def get_local_rank(): return int(os.getenv("PADDLE_RANK_IN_NODE", 0)) print_rank_0 = print mode_to_index = {"Train": 0, "Eval": 1, "Test": 2} mode_to_key = {"Train": "train", "Eval": "dev", "Test": "test"} class ErnieDataset(paddle.io.Dataset): def __init__(self, input_dir, tokenizer_type, split, num_samples, mode, max_seq_length, masked_lm_prob, short_seq_prob, seed, binary_head, share_folder, favor_longer_ngram, max_ngrams): tokenizer = ErnieTokenizer.from_pretrained(tokenizer_type) tokenizer.extend_chinese_char() files = get_train_data_file(input_dir)[0] skip_warmup = True indexed_dataset = get_indexed_dataset_(files, None, skip_warmup) total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 splits = get_train_valid_test_split_(split, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') def print_split_stats(name, index): print_rank_0(' {}:'.format(name)) print_rank_0(' document indices in [{}, {}) total of {} ' 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index])) start_index = indexed_dataset.doc_idx[splits[index]] end_index = indexed_dataset.doc_idx[splits[index + 1]] print_rank_0(' sentence indices in [{}, {}) total of {} ' 'sentences'.format(start_index, end_index, end_index - start_index)) index = mode_to_index[mode] print_split_stats(mode, index) # dataset = None assert splits[index + 1] > splits[index] # Get the pointer to the original doc-idx so we can set it later. doc_idx_ptr = indexed_dataset.get_doc_idx() # Slice the doc-idx start_index = splits[index] # Add +1 so we can index into the dataset to get the upper bound. end_index = splits[index + 1] + 1 # New doc_idx view. indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) # Build the dataset accordingly. self.seed = seed self.masked_lm_prob = masked_lm_prob self.max_seq_length = max_seq_length self.binary_head = binary_head self.share_folder = share_folder self.indexed_dataset = indexed_dataset self.favor_longer_ngram = favor_longer_ngram self.max_ngrams = max_ngrams # Build the samples mapping. self.samples_mapping = get_samples_mapping( self.indexed_dataset, files, None, num_samples, self.max_seq_length - 3, # account for added tokens short_seq_prob, self.seed, mode, self.binary_head, self.share_folder) self.vocab_id_list = list(tokenizer.vocab.idx_to_token.keys()) self.vocab_id_to_token_dict = copy.deepcopy( tokenizer.vocab.idx_to_token) self.vocab_token_to_id_dict = copy.deepcopy( tokenizer.vocab.token_to_idx) # ERNIE is chinse char level model, sometime is need # add ## chinse char to encode and decode. # Here we extend the vocab dict. 
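        # added_tokens_decoder / added_tokens_encoder hold the tokenizer's added tokens,
        # presumably the '##'-prefixed Chinese character tokens registered by
        # tokenizer.extend_chinese_char() above.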
self.vocab_id_to_token_dict.update(tokenizer.added_tokens_decoder) self.vocab_token_to_id_dict.update(tokenizer.added_tokens_encoder) self.cls_id = tokenizer.cls_token_id self.sep_id = tokenizer.sep_token_id self.mask_id = tokenizer.mask_token_id self.pad_id = tokenizer.pad_token_id def __len__(self): return self.samples_mapping.shape[0] def __getitem__(self, idx): start_idx, end_idx, seq_length = self.samples_mapping[idx] sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) return build_training_sample( sample, seq_length, self.max_seq_length, # needed for padding self.vocab_id_list, self.vocab_id_to_token_dict, self.vocab_token_to_id_dict, self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.binary_head, self.favor_longer_ngram, self.max_ngrams) def build_training_sample(sample, target_seq_length, max_seq_length, vocab_id_list, vocab_id_to_token_dict, vocab_token_to_id_dict, cls_id, sep_id, mask_id, pad_id, masked_lm_prob, np_rng, binary_head, favor_longer_ngram=False, max_ngrams=3): """Biuld training sample. Arguments: sample: A list of sentences in which each sentence is a list token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. vocab_token_to_id_dict: A dictionary from text tokens to vocab ids. cls_id: Start of example id. sep_id: Separator id. mask_id: Mask token id. pad_id: Padding token id. masked_lm_prob: Probability to mask tokens. np_rng: Random number genenrator. Note that this rng state should be numpy and not python since python randint is inclusive for the opper bound whereas the numpy one is exclusive. """ if binary_head: # We assume that we have at least two sentences in the sample assert len(sample) > 1, "The sentence num should be large than 1." assert target_seq_length <= max_seq_length # Divide sample into two segments (A and B). if binary_head: tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, np_rng) else: tokens_a = [] for j in range(len(sample)): tokens_a.extend(sample[j]) tokens_b = [] is_next_random = False # Truncate to `target_sequence_length`. max_num_tokens = target_seq_length truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b), max_num_tokens, np_rng) # Build tokens and toketypes. tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id) # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, vocab_token_to_id_dict=vocab_token_to_id_dict, to_chinese_char=True, inplace_random_mask=False, favor_longer_ngram=favor_longer_ngram, max_ngrams=max_ngrams, ) # Padding. 
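    # Pad every field to max_seq_length; this module's pad_and_convert_to_numpy
    # (defined below) builds an additive float32 attention mask of shape [1, 1, max_seq_length].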
    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                                   masked_labels, pad_id, max_seq_length)

    return tokens_np, tokentypes_np, padding_mask_np, masked_positions, masked_labels, int(
        is_next_random)


def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                             masked_labels, pad_id, max_seq_length):
    """Pad sequences and convert them to numpy."""

    # Some checks.
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    assert len(tokentypes) == num_tokens
    assert len(masked_positions) == len(masked_labels)

    # Tokens and token types.
    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

    # Padding mask.
    padding_mask_np = np.array(
        [1] * num_tokens + [0] * padding_length, dtype=np.float32)
    padding_mask_np = (1 - padding_mask_np) * -1e4
    padding_mask_np = padding_mask_np.reshape([1, 1, -1])

    # Labels and loss mask.
    labels = [-1] * max_seq_length
    loss_mask = [0] * max_seq_length
    for i in range(len(masked_positions)):
        assert masked_positions[i] < num_tokens
        labels[masked_positions[i]] = masked_labels[i]
        loss_mask[masked_positions[i]] = 1
    labels_np = np.array(labels, dtype=np.int64)
    loss_mask_np = np.array(loss_mask, dtype=np.int64)
    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np


def get_train_data_file(input_dir):
    # `logger` is not imported at the top of this file; import it here so the
    # multi-dataset branch below does not raise a NameError.
    from ppfleetx.utils.log import logger

    if len(input_dir.split()) > 1:
        # weight-1 data-prefix-1 weight-2 data-prefix-2 ...
        return input_dir.split()
    else:
        files = [
            os.path.join(input_dir, f) for f in os.listdir(input_dir)
            if (os.path.isfile(os.path.join(input_dir, f)) and
                "_idx.npz" in str(f))
        ]
        files = [x.replace("_idx.npz", "") for x in files]

        if len(files) > 1:
            ret = []
            logger.info("You are using multi-dataset:")
            for x in files:
                ret.append(1.0)
                ret.append(x)
                logger.info(" > set weight of %s dataset to 1.0" % x)
            return ret

    return files


def get_train_valid_test_split_(splits, size):
    """
    Get dataset splits from comma or '/' separated string list.
    """
    splits = [float(s) for s in splits]
    while len(splits) < 3:
        splits.append(0.)
splits = splits[:3] splits_sum = sum(splits) assert splits_sum > 0.0 splits = [split / splits_sum for split in splits] splits_index = [0] for index, split in enumerate(splits): splits_index.append(splits_index[index] + int( round(split * float(size)))) diff = splits_index[-1] - size for index in range(1, len(splits_index)): splits_index[index] -= diff assert len(splits_index) == 4 assert splits_index[-1] == size return splits_index class ErnieSeqClsDataset(paddle.io.Dataset): def __init__(self, dataset_type, tokenizer_type, max_seq_len, mode): self.dataset = dataset_type self.max_seq_len = max_seq_len self.mode = mode_to_key[mode] from ppfleetx.data.tokenizers import get_ernie_tokenizer self.tokenizer = get_ernie_tokenizer(tokenizer_type) dataset_config = self.dataset.split(" ") raw_datasets = load_dataset( dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], ) self.label_list = getattr(raw_datasets['train'], "label_list", None) # Define dataset pre-process function if "clue" in self.dataset: trans_fn = partial(self._clue_trans_fn) else: trans_fn = partial(self._seq_trans_fn) self.seqcls_dataset = raw_datasets[self.mode].map(trans_fn) def __getitem__(self, idx): return self.seqcls_dataset.__getitem__(idx) def __len__(self): return self.seqcls_dataset.__len__() def _seq_trans_fn(self, example): return self._convert_example( example, tokenizer=self.tokenizer, max_seq_length=self.max_seq_len, ) def _clue_trans_fn(self, example): return self._convert_clue( example, label_list=self.label_list, tokenizer=self.tokenizer, max_seq_length=self.max_seq_len, ) def _convert_example(self, example, tokenizer, max_seq_length=512, is_test=False): is_test = True if 'label' in example.keys(): is_test = False if "text_b" in example.keys(): text = example["text_a"] text_pair = example["text_b"] else: text = example["text"] text_pair = None encoded_inputs = tokenizer( text=text, text_pair=text_pair, max_seq_len=max_seq_length) input_ids = encoded_inputs["input_ids"] token_type_ids = encoded_inputs["token_type_ids"] if is_test: return { "input_ids": input_ids, "token_type_ids": token_type_ids, } else: # label = np.array([example["label"]], dtype="int64") label = int(example["label"]) return { "input_ids": input_ids, "token_type_ids": token_type_ids, "labels": label } # Data pre-process function for clue benchmark datatset def _convert_clue(self, example, label_list, tokenizer=None, max_seq_length=512, **kwargs): """convert a glue example into necessary features""" is_test = False if 'label' not in example.keys(): is_test = True if not is_test: # `label_list == None` is for regression task label_dtype = "int64" if label_list else "float32" # Get the label example['label'] = int(example[ "label"]) if label_dtype != "float32" else float(example[ "label"]) label = example['label'] # Convert raw text to feature if 'keyword' in example: # CSL sentence1 = " ".join(example['keyword']) example = { 'sentence1': sentence1, 'sentence2': example['abst'], 'label': example['label'] } elif 'target' in example: # wsc text, query, pronoun, query_idx, pronoun_idx = example[ 'text'], example['target']['span1_text'], example['target'][ 'span2_text'], example['target']['span1_index'], example[ 'target']['span2_index'] text_list = list(text) assert text[pronoun_idx:(pronoun_idx + len( pronoun))] == pronoun, "pronoun: {}".format(pronoun) assert text[query_idx:(query_idx + len(query) )] == query, "query: {}".format(query) if pronoun_idx > query_idx: text_list.insert(query_idx, "_") text_list.insert(query_idx + 
len(query) + 1, "_") text_list.insert(pronoun_idx + 2, "[") text_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") else: text_list.insert(pronoun_idx, "[") text_list.insert(pronoun_idx + len(pronoun) + 1, "]") text_list.insert(query_idx + 2, "_") text_list.insert(query_idx + len(query) + 2 + 1, "_") text = "".join(text_list) example['sentence'] = text if tokenizer is None: return example if 'sentence' in example: example = tokenizer( example['sentence'], max_seq_len=max_seq_length) elif 'sentence1' in example: example = tokenizer( example['sentence1'], text_pair=example['sentence2'], max_seq_len=max_seq_length) if not is_test: if "token_type_ids" in example: return { "input_ids": example['input_ids'], "token_type_ids": example['token_type_ids'], "labels": label } else: return {"input_ids": example['input_ids'], "labels": label} else: return { "input_ids": example['input_ids'], "token_type_ids": example['token_type_ids'] } ================================================ FILE: ppfleetx/data/dataset/glue_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import numpy as np import paddle from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.utils.download import cached_path from ppfleetx.utils.file import unzip, parse_csv __all__ = [ 'CoLA', 'SST2', 'MNLI', 'QNLI', 'RTE', 'WNLI', 'MRPC', 'QQP', 'STSB' ] """ Single-Sentence Tasks: * CoLA * SST-2 Similarity and Paraphrase Tasks: * MRPC * STS-B * QQP Inference Tasks: * MNLI * QNLI * RTE * WNLI """ class CoLA(paddle.io.Dataset): """The Corpus of Linguistic Acceptability consists of English acceptability judgments drawn from books and journal articles on linguistic theory. Each example is a sequence of words annotated with whether it is a grammatical English sentence.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/cola.html#CoLA URL = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip" MD5 = "9f6d88c3558ec424cd9d66ea03589aba" NUM_LINES = { "train": 8551, "dev": 527, "test": 516, } _PATH = "cola_public_1.1.zip" DATASET_NAME = "CoLA" _EXTRACTED_FILES = { "train": os.path.join("raw", "in_domain_train.tsv"), "dev": os.path.join("raw", "in_domain_dev.tsv"), "test": os.path.join("raw", "out_of_domain_dev.tsv"), } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
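        # (Added note) The extracted CoLA files are tab-separated. Based on how
        # _filter_res and _modify_res below consume them, each kept row has four
        # columns and appears to be laid out as:
        #   column 0: source id, column 1: 0/1 acceptability label,
        #   column 2: original author annotation, column 3: the sentence text,
        # so _modify_res returns the (sentence, label) pair (x[3], int(x[1])).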
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _filter_res(x): return len(x) == 4 def _modify_res(x): return (x[3], int(x[1])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res, filter_funcs=_filter_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[1] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class SST2(paddle.io.Dataset): """The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. We use the two-way (positive/negative) class split, and use only sentence-level labels.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/sst2.html#SST2 URL = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip" MD5 = "9f81648d4199384278b86e315dac217c" NUM_LINES = { "train": 67349, "dev": 872, "test": 1821, } _PATH = "SST-2.zip" DATASET_NAME = "SST2" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] # test split for SST2 doesn't have labels if split == "test": def _modify_test_res(t): return (t[1].strip(), ) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_test_res) else: def _modify_res(t): return (t[0].strip(), int(t[1])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[1] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class MNLI(paddle.io.Dataset): """The Multi-Genre Natural Language Inference Corpus is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are gathered from ten different sources, including transcribed speech, fiction, and government reports. We use the standard test set, for which we obtained private labels from the authors, and evaluate on both the matched (in-domain) and mismatched (cross-domain) section. 
We also use and recommend the SNLI corpus as 550k examples of auxiliary training data.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/mnli.html#MNLI URL = "https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip" MD5 = "0f70aaf66293b3c088a864891db51353" NUM_LINES = { "train": 392702, "dev_matched": 9815, "dev_mismatched": 9832, } _PATH = "multinli_1.0.zip" DATASET_NAME = "MNLI" _EXTRACTED_FILES = { "train": "multinli_1.0_train.txt", "dev_matched": "multinli_1.0_dev_matched.txt", "dev_mismatched": "multinli_1.0_dev_mismatched.txt", } LABEL_TO_INT = {"entailment": 0, "neutral": 1, "contradiction": 2} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev_matched', 'dev_mismatched'] def _filter_res(x): return x[0] in self.LABEL_TO_INT def _modify_res(x): return (x[5], x[6], self.LABEL_TO_INT[x[0]]) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res, filter_funcs=_filter_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) return input_ids, sample[2] def __len__(self): return len(self.samples) @property def class_num(self): return 3 class QNLI(paddle.io.Dataset): """The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question (written by an annotator). We convert the task into sentence pair classification by forming a pair between each question and each sentence in the corresponding context, and filtering out pairs with low lexical overlap between the question and the context sentence. The task is to determine whether the context sentence contains the answer to the question. 
This modified version of the original task removes the requirement that the model select the exact answer, but also removes the simplifying assumptions that the answer is always present in the input and that lexical overlap is a reliable cue.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/qnli.html#QNLI URL = "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip" MD5 = "b4efd6554440de1712e9b54e14760e82" NUM_LINES = { "train": 104743, "dev": 5463, "test": 5463, } _PATH = "QNLIv2.zip" DATASET_NAME = "QNLI" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } MAP_LABELS = {"entailment": 0, "not_entailment": 1} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for QNLI doesn't have labels return (x[1], x[2]) else: return (x[1], x[2], self.MAP_LABELS[x[3]]) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class RTE(paddle.io.Dataset): """The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/rte.html#RTE URL = "https://dl.fbaipublicfiles.com/glue/data/RTE.zip" MD5 = "bef554d0cafd4ab6743488101c638539" NUM_LINES = { "train": 67349, "dev": 872, "test": 1821, } _PATH = "RTE.zip" DATASET_NAME = "RTE" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } MAP_LABELS = {"entailment": 0, "not_entailment": 1} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for RTE doesn't have labels return (x[1], x[2]) else: return (x[1], x[2], self.MAP_LABELS[x[3]]) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class WNLI(paddle.io.Dataset): """The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices. The examples are manually constructed to foil simple statistical methods: Each one is contingent on contextual information provided by a single word or phrase in the sentence. To convert the problem into sentence pair classification, we construct sentence pairs by replacing the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of new examples derived from fiction books that was shared privately by the authors of the original corpus. While the included training set is balanced between two classes, the test set is imbalanced between them (65% not entailment). Also, due to a data quirk, the development set is adversarial: hypotheses are sometimes shared between training and development examples, so if a model memorizes the training examples, they will predict the wrong label on corresponding development set example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence between a model's score on this task and its score on the unconverted original task. We call converted dataset WNLI (Winograd NLI).""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/wnli.html#WNLI URL = "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip" MD5 = "a1b4bd2861017d302d29e42139657a42" NUM_LINES = { "train": 635, "dev": 71, "test": 146, } _PATH = "WNLI.zip" DATASET_NAME = "WNLI" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for WNLI doesn't have labels return (x[1], x[2]) else: return (x[1], x[2], int(x[3])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class MRPC(paddle.io.Dataset): """The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/mrpc.html#MRPC URL = { "train": "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt", "test": "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt", } MD5 = { "train": "793daf7b6224281e75fe61c1f80afe35", "test": "e437fdddb92535b820fe8852e2df8a49", } NUM_LINES = { "train": 4076, "test": 1725, } DATASET_NAME = "MRPC" _EXTRACTED_FILES = { "train": "msr_paraphrase_train.txt", "test": "msr_paraphrase_test.txt", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) cached_path(self.URL[split], cache_dir=os.path.abspath(self.root)) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'test'] def _modify_res(x): return (x[3], x[4], int(x[0])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) return input_ids, sample[2] def __len__(self): return len(self.samples) @property def class_num(self): return 2 class QQP(paddle.io.Dataset): """The Quora Question Pairs2 dataset is a collection of question pairs from the community question-answering website Quora. 
The task is to determine whether a pair of questions are semantically equivalent.""" # ref https://huggingface.co/datasets/glue/blob/main/glue.py#L212-L239 URL = "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip" MD5 = "884bf26e39c783d757acc510a2a516ef" NUM_LINES = { "train": 363846, "dev": 40430, "test": 390961, } _PATH = "QQP-clean.zip" DATASET_NAME = "QQP" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } MAP_LABELS = {"not_duplicate": 0, "duplicate": 1} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for QQP doesn't have labels return (x[1], x[2]) else: return (x[3], x[4], int(x[5])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class STSB(paddle.io.Dataset): """The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 1 to 5.""" # ref https://huggingface.co/datasets/glue/blob/main/glue.py#L240-L267 URL = "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip" MD5 = "d573676be38f1a075a5702b90ceab3de" NUM_LINES = { "train": 5749, "dev": 1500, "test": 1379, } _PATH = "STS-B.zip" DATASET_NAME = "STSB" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for STSB doesn't have labels return (x[7], x[8]) else: return (x[7], x[8], float(x[9])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': # Note(GuoxiaWang): We need return shape [1] value, # so that we can attain a batched label with shape [batchsize, 1]. # Because the logits shape is [batchsize, 1], and feed into MSE loss. return input_ids, np.array([sample[2]], dtype=np.float32) else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 ================================================ FILE: ppfleetx/data/dataset/gpt_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import time import numpy as np import re import math import json import paddle from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from ppfleetx.data.tokenizers import GPTTokenizer # TODO(haohongxiang): to solve the problem of cross-reference import paddlenlp from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer mode_to_index = {"Train": 0, "Eval": 1, "Test": 2} MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "MoE": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } class GPTDataset(paddle.io.Dataset): def __init__(self, input_dir, split, max_seq_len, num_samples, mode, model_type="GPT", seed=1234): files = get_train_data_file(input_dir) files.sort() input_dir = [files[0]] local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) if local_rank == 0: try: import ppfleetx.data.data_tools.cpp.fast_index_map_helpers except Exception as e: start_time = time.time() print('> compiling dataset index builder ...') from ppfleetx.data.data_tools.cpp.compile import compile_helper compile_helper() print( '>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) device_world_size = paddle.distributed.get_world_size() if device_world_size > 1 and local_rank != 0: while True: try: import ppfleetx.data.data_tools.cpp.fast_index_map_helpers break except Exception as e: print("> wait for helpers to be compiled!") time.sleep(1) try: data_world_size = env.get_data_world_size() logger.info( "The distributed run, total device num:{}, distinct dataflow num:{}.". format(device_world_size, data_world_size)) except AttributeError: pass assert len(input_dir) == 1, "GPT only support one dataset for now." 
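        # (Added note) `input_prefix` below is a path prefix rather than a full
        # file name. The current dataset format expects two sibling files:
        #   <prefix>_ids.npy - token ids of all documents, flattened to 1-D
        #   <prefix>_idx.npz - metadata with "lens", the per-document lengths,
        # so that sum(lens) == len(ids). A single legacy <prefix>_ids.npz file
        # holding both "ids" and "lens" is still accepted with a warning.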
input_prefix = input_dir[0] if os.path.isfile(input_prefix + "_ids.npz"): logger.warning( "You are using compatible dataset, please make new dataset as the readme!" ) process_data = np.load( input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True) sample_ids = process_data["ids"] sample_lens = process_data["lens"].astype("int32") else: for suffix in ["_ids.npy", "_idx.npz"]: if not os.path.isfile(input_prefix + suffix): raise ValueError("File Not found, %s" % (input_prefix + suffix)) sample_ids = np.load( input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True) # All documment ids, extend as 1-D array. process_data = np.load(input_prefix + "_idx.npz") # The len(sample_lens) num of docs # The sum(sample_lens) should equal len(sample_ids) sample_lens = process_data["lens"] splits = get_train_valid_test_split_(split, len(sample_lens)) assert len(sample_lens) >= splits[ -1], "The document nums should larger than max of splits, but %s < %s" % ( len(sample_lens), splits[-1]) tokenizer_class, pretrained_name = MODEL_CLASSES[model_type] tokenizer = tokenizer_class.from_pretrained(pretrained_name) self.input_dir = input_dir self.max_seq_len = max_seq_len self.mode = mode self.name = "gpt_" + mode self.eos_id = tokenizer.eos_token_id self.sample_ids = sample_ids self.sample_lens = sample_lens self.build_data_file = (local_rank == 0) if mode in mode_to_index.keys(): index = mode_to_index[mode] else: raise ValueError("valid str value for 'mode'") documents = np.arange(splits[index], splits[index + 1]) if documents is None: document_ids = np.arange(0, self.sample_lens.shape[0]) else: document_ids = documents self.doc_idx, self.sample_idx, self.shuffle_idx = \ construct_samples_and_shuffle_data(self.name, input_prefix, document_ids,\ self.sample_lens, num_samples, max_seq_len, seed, self.build_data_file) # The doc cumsum start pos self.start_pos = [0] + np.cumsum(self.sample_lens).tolist() def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # Attention mask for the attention calulate # attention_mask = np.tri(seq_length, seq_length).reshape((1, seq_length, # seq_length)) # The pad and eos tokens do not contribute the loss loss_mask = np.ones(seq_length, dtype="float32") loss_mask[tokens == self.eos_id] = 0.0 position_ids = np.arange(0, seq_length, dtype="int64") labels = np.array(labels).astype("int64") tokens = np.array(tokens).astype("int64") if self.mode == "Test": return [tokens, position_ids] else: return [tokens, position_ids, labels, loss_mask] def _get_single_sample_from_idx(self, doc_index_f, doc_index_l, offset_f, offset_l): """ The input means: doc_index_f: data from the first doc. doc_index_l: data from the last doc. offset_f: offset of the first doc. offset_l: offset of the last doc. """ # Data from the sample doc. just select the needed ids. if doc_index_f == doc_index_l: current_start_pos = self.start_pos[self.doc_idx[doc_index_f]] return self.sample_ids[current_start_pos+offset_f:\ current_start_pos+offset_l+1].tolist() # Data from multi docs. 
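        # (Added note) When a sample crosses document boundaries, the branch
        # below concatenates the tail of the first document, every full document
        # in between, and the head of the last document, so each sample still
        # yields max_seq_len + 1 tokens before being split into inputs/labels.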
else: current_start_pos = self.start_pos[self.doc_idx[doc_index_f]] next_start_pos = self.start_pos[self.doc_idx[doc_index_f] + 1] tokens = self.sample_ids[current_start_pos + offset_f: next_start_pos].tolist() for i in range(doc_index_f + 1, doc_index_l): current_start_pos = self.start_pos[self.doc_idx[i]] next_start_pos = self.start_pos[self.doc_idx[i] + 1] tokens.extend(self.sample_ids[current_start_pos:next_start_pos] .tolist()) last_start_pos = self.start_pos[self.doc_idx[doc_index_l]] tokens.extend(self.sample_ids[last_start_pos:last_start_pos + offset_l + 1].tolist()) return tokens def __getitem__(self, index): idx = self.shuffle_idx[index] # Start and end documents and offsets. doc_index_f = self.sample_idx[idx][0] doc_index_l = self.sample_idx[idx + 1][0] offset_f = self.sample_idx[idx][1] offset_l = self.sample_idx[idx + 1][1] tokens = self._get_single_sample_from_idx(doc_index_f, doc_index_l, offset_f, offset_l) return self._construct_sample(tokens) def __len__(self): return self.sample_idx.shape[0] - 1 def get_train_data_file(input_dir): files = [ os.path.join(input_dir, f) for f in os.listdir(input_dir) if (os.path.isfile(os.path.join(input_dir, f)) and str(f) .endswith("_idx.npz")) ] files = [x.replace("_idx.npz", "") for x in files] if len(files) == 0: logger.warning( "Not found dataset with name of xxx_ids.npy and xxx_idx.npz! Try to found old compatible xxx_ids.npz file." ) else: return files files = [ os.path.join(input_dir, f) for f in os.listdir(input_dir) if (os.path.isfile(os.path.join(input_dir, f)) and str(f) .endswith("_ids.npz")) ] files = [x.replace("_ids.npz", "") for x in files] if len(files) == 0: raise RuntimeError( "Not found dataset with name of xxx_ids.npz in given input_dir '{}'! ". format(input_dir)) else: return files def get_train_valid_test_split_(splits, size): """ Get dataset splits from comma or '/' separated string list. """ splits = [float(s) for s in splits] while len(splits) < 3: splits.append(0.) splits = splits[:3] splits_sum = sum(splits) assert splits_sum > 0.0 splits = [split / splits_sum for split in splits] splits_index = [0] for index, split in enumerate(splits): splits_index.append(splits_index[index] + int( round(split * float(size)))) diff = splits_index[-1] - size for index in range(1, len(splits_index)): splits_index[index] -= diff assert len(splits_index) == 4 assert splits_index[-1] == size return splits_index def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes, num_samples, seq_length, seed, build_data_file): """ documents: document index from 0 to len(docs) sizes: the length list of all docs. num_samples: total step*bs iterations of data. seq_length: the sequence length. sum(sizes) = tokens_per_epoch data_nums = num_samples * micro_batch_size num_epochs = (data_nums + 1) // sum(sizes) len(doc_idx) = num_epochs * sum(sizes) """ # Number of tokens in each epoch and number of required epochs. tokens_per_epoch = _num_tokens(documents, sizes) num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) # Rng state np_rng = np.random.RandomState(seed=seed) # Filename of the index mappings. _filename = data_prefix _filename += '_{}_indexmap'.format(name) _filename += '_{}ns'.format(num_samples) _filename += '_{}sl'.format(seq_length) doc_idx_filename = _filename + '_doc_idx.npy' sample_idx_filename = _filename + '_sample_idx.npy' shuffle_idx_filename = _filename + '_shuffle_idx.npy' # Sava random state savedState = np_rng.get_state() # Build the indexed mapping if not exist. 
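    # (Added note) Three memory-mapped index files are produced per dataset:
    #   *_doc_idx.npy     - the document order: documents shuffled and repeated
    #                       num_epochs times
    #   *_sample_idx.npy  - (doc_idx position, token offset) pairs; rows i and
    #                       i + 1 delimit exactly seq_length + 1 tokens, giving
    #                       sample_idx.shape[0] - 1 samples in total
    #   *_shuffle_idx.npy - a permutation mapping the dataloader index onto
    #                       sample_idx entries
    # Only the rank with build_data_file=True writes them; other ranks poll for
    # the files in the else-branch below.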
if build_data_file: if (not os.path.isfile(doc_idx_filename)) or \ (not os.path.isfile(sample_idx_filename)) or \ (not os.path.isfile(shuffle_idx_filename)): if num_epochs == 1: separate_last_epoch = False else: num_samples_from_epochs_minus_one = ( (num_epochs - 1) * tokens_per_epoch - 1) // seq_length last_epoch_num_samples = num_samples - \ num_samples_from_epochs_minus_one assert last_epoch_num_samples >= 0, \ 'last epoch number of samples should be non-negative.' num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length assert last_epoch_num_samples < (num_samples_per_epoch + 1), \ 'last epoch number of samples exceeded max value.' separate_last_epoch = ( last_epoch_num_samples < int(0.80 * num_samples_per_epoch)) # Note. len(doc_idx) = num_epochs * len(doc) start_time = time.time() doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch) np.save(doc_idx_filename, doc_idx, allow_pickle=True) print(' > elasped time to build and save doc-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) # sample-idx. pos of each seq_len of data. start_time = time.time() assert doc_idx.dtype == np.int32 assert sizes.dtype == np.int32 from ppfleetx.data.data_tools.cpp import fast_index_map_helpers sample_idx = fast_index_map_helpers.build_sample_idx( sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch) # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, # num_epochs, tokens_per_epoch) np.save(sample_idx_filename, sample_idx, allow_pickle=True) print(' > elasped time to build and save sample-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) # shuffle-idx. start_time = time.time() if separate_last_epoch: num_samples_ = num_samples_from_epochs_minus_one else: num_samples_ = sample_idx.shape[0] - 1 # Shuffle all seq len data. shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng) np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) print(' > elasped time to build and save shuffle-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) else: while True: if (not os.path.isfile(doc_idx_filename)) or \ (not os.path.isfile(sample_idx_filename)) or \ (not os.path.isfile(shuffle_idx_filename)): time.sleep(3) else: try: np.load( shuffle_idx_filename, allow_pickle=True, mmap_mode='r') break except Exception as e: print( "%s file is still writing or damaged, please wait a moment." % shuffle_idx_filename) time.sleep(3) # Restore random state np_rng.set_state(savedState) try: if paddle.distributed.get_world_size() > 1: if paddle.in_dynamic_mode(): paddle.distributed.barrier() except AssertionError: pass # Load mappings. doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') shuffle_idx = np.load( shuffle_idx_filename, allow_pickle=True, mmap_mode='r') return doc_idx, sample_idx, shuffle_idx def _num_tokens(documents, lens): """Total number of tokens in the dataset.""" return np.sum(lens[documents]) def _num_epochs(tokens_per_epoch, seq_length, num_samples): """Based on number of samples and sequence lenght, calculate how many epochs will be needed.""" num_epochs = 0 total_tokens = 0 while True: num_epochs += 1 total_tokens += tokens_per_epoch if ((total_tokens - 1) // seq_length) >= num_samples: return num_epochs def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): """ Build an array with length = number-of-epochs * number-of-documents. Each index is mapped to a corresponding document. 
""" if not separate_last_epoch or num_epochs == 1: doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] doc_idx[:] = documents # The documents repeat num_epochs times. doc_idx = doc_idx.reshape(-1) doc_idx = doc_idx.astype(np.int32) np_rng.shuffle(doc_idx) return doc_idx doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False) doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) return np.concatenate((doc_idx_first, doc_idx_last)) def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): """ num_samples + 1, pos of bs data the distance between two points for sample idx is bs tokens. """ num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length sample_idx = np.zeros([int(num_samples) + 1, 2], dtype=np.int32) sample_index = 0 doc_idx_index = 0 doc_offset = 0 sample_idx[sample_index][0] = doc_idx_index sample_idx[sample_index][1] = doc_offset sample_index += 1 while sample_index <= num_samples: remaining_seq_length = seq_length + 1 while remaining_seq_length != 0: doc_id = doc_idx[doc_idx_index] doc_length = sizes[doc_id] - doc_offset remaining_seq_length -= doc_length if remaining_seq_length <= 0: doc_offset += (remaining_seq_length + doc_length - 1) remaining_seq_length = 0 else: doc_idx_index += 1 doc_offset = 0 sample_idx[sample_index][0] = doc_idx_index sample_idx[sample_index][1] = doc_offset sample_index += 1 return sample_idx def _build_shuffle_idx(num_samples, total_size, np_rng): dtype_ = np.uint32 if total_size >= (np.iinfo(np.uint32).max - 1): dtype_ = np.int64 shuffle_idx_first = np.arange( start=0, stop=num_samples, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx_first) if num_samples == total_size: return shuffle_idx_first shuffle_idx_last = np.arange( start=num_samples, stop=total_size, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) class LM_Eval_Dataset(paddle.io.Dataset): def __init__(self, input_dir, max_seq_len, overlapping_eval=None, model_type="GPT", **kwargs): tokenizer_class, pretrained_name = MODEL_CLASSES[model_type] tokenizer = tokenizer_class.from_pretrained(pretrained_name) with open(input_dir, "rb") as reader: entire_data = reader.read().decode('utf-8') self.num_original_tokens = len(entire_data.strip().split(" ")) entire_data = self._wikitext_detokenizer(entire_data) self.tokens = tokenizer.encode(entire_data) self.num_tokenized_tokens = len(self.tokens) print('Original Tokens: %d, Detokenized tokens: %d' % (self.num_original_tokens, self.num_tokenized_tokens)) self.seq_len = max_seq_len self.pad_idx = tokenizer.eos_token_id self.overlapping_eval = overlapping_eval if self.overlapping_eval is None: self.overlapping_eval = self.seq_len self.overlapping_eval = max(1, self.overlapping_eval) self.total_targets = len(self.tokens) - 1 # remove first sequence tokens targets = max(self.total_targets - self.overlapping_eval, 0) self.total_sequences = max( math.ceil(targets / self.overlapping_eval) + 1, 1) def __len__(self): return self.total_sequences def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss loss_mask = np.ones(seq_length, dtype="float32") loss_mask[tokens == self.pad_idx] = 0.0 position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default 
# attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, loss_mask, attention_mask, position_ids, labels] def __getitem__(self, idx): start_idx = idx * self.overlapping_eval end_idx = start_idx + self.seq_len tokens = self.tokens[start_idx:end_idx + 1] num_tokens = len(tokens) if num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad [tokens, loss_mask, attention_mask, position_ids, labels] = self._construct_sample(tokens) if self.overlapping_eval != self.seq_len and idx != 0: loss_mask[:-self.overlapping_eval] *= 0 return [tokens, loss_mask, attention_mask, position_ids, labels, \ np.array([self.num_original_tokens, self.num_tokenized_tokens])] def _wikitext_detokenizer(self, string): # contractions string = string.replace("s '", "s'") string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) # number separators string = string.replace(" @-@ ", "-") string = string.replace(" @,@ ", ",") string = string.replace(" @.@ ", ".") # punctuation string = string.replace(" : ", ": ") string = string.replace(" ; ", "; ") string = string.replace(" . ", ". ") string = string.replace(" ! ", "! ") string = string.replace(" ? ", "? ") string = string.replace(" , ", ", ") # double brackets string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) # miscellaneous string = string.replace("= = = =", "====") string = string.replace("= = =", "===") string = string.replace("= =", "==") string = string.replace(" " + chr(176) + " ", chr(176)) string = string.replace(" \n", "\n") string = string.replace("\n ", "\n") string = string.replace(" N ", " 1 ") string = string.replace(" 's", "'s") return string class Lambada_Eval_Dataset(paddle.io.Dataset): def __init__(self, input_dir, max_seq_len, model_type="GPT", **kwargs): tokenizer_class, pretrained_name = MODEL_CLASSES[model_type] tokenizer = tokenizer_class.from_pretrained(pretrained_name) tokenized_data = [] tokenized_label = [] with open(input_dir, 'r') as f: for line in f.readlines(): text = json.loads(line)['text'] tokens, labels = self._get_tokens(tokenizer, text) tokenized_data.append(tokens) tokenized_label.append(labels) self.pad_idx = tokenizer.eos_token_id self.seq_len = max_seq_len self.tokens = tokenized_data self.labels = tokenized_label def __len__(self): return len(self.tokens) def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default #attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, attention_mask, position_ids, labels] def __getitem__(self, idx): tokens = self.tokens[idx][:self.seq_len] labels = self.labels[idx] tokens = tokens + labels num_tokens = len(tokens) if num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad loss_mask = np.zeros(self.seq_len, dtype="float32") loss_mask[num_tokens - len(labels) - 
1:num_tokens - 1] = 1. [tokens, attention_mask, position_ids, labels] = self._construct_sample(tokens) return [ tokens, loss_mask, attention_mask, position_ids, labels, np.array([self.__len__()]) ] def _get_tokens(self, tokenizer, text, strict=True): if not strict: tokens = tokenizer.encode(text) return tokens[:-1], [tokens[-1]] last_token = text.split()[-1] start_idx = text.rfind(last_token) beginning_tokens = tokenizer.encode(text[:start_idx].strip()) last_token = tokenizer.encode(' ' + last_token) return beginning_tokens, last_token ================================================ FILE: ppfleetx/data/dataset/multimodal_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time import gzip import random import base64 import numpy as np import blobfile as bf from random import randint, choice from tqdm import tqdm from io import BytesIO from pathlib import Path from copy import deepcopy import PIL from PIL import Image, ImageFile import paddle from paddle.io import Dataset, DataLoader from paddle.distributed import get_world_size from paddle.vision import transforms as T from ppfleetx.utils.log import logger def get_keys(data_path, gpu_num): files = [ file.strip() for file in open(data_path).readlines() if file.strip() != "" ] local_rank = paddle.distributed.get_rank() if len(files) % gpu_num == 0: keys_extend = list(files) else: added_num = gpu_num - (len(files) % gpu_num) try: keys_extend = files + random.sample(files, added_num) except: keys_extend = files + random.sample(files, 1) * added_num keys = keys_extend[local_rank::gpu_num] logger.info("keys: {} {}".format(keys, local_rank)) return keys class ImagenDataset(Dataset): def __init__(self, input_path, image_format='base64', shuffle=False, image_size=64, text_max_len=128, filter_image_resolution=128, tokenizer=None, sr=False, split='train', interpolation="bicubic", flip_p=0.5): super().__init__() device_world_size = paddle.distributed.get_world_size() self.filename = get_keys(input_path, gpu_num=device_world_size) if shuffle: random.shuffle(self.filename) self.filter_image_resolution = filter_image_resolution self.text_max_len = text_max_len self.split = split self.tokenizer = tokenizer self.sr = sr if sr: self.transform = T.Compose([T.Resize(image_size), T.ToTensor()]) self.for_line = self.get_line_for_line(self.filename).__iter__() self.good_index = [] self.interpolation = { "linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = T.RandomHorizontalFlip(prob=flip_p) self.image_size = image_size def load_path(self, data_path, f_index=None): if f_index is None: offset = 0 with open(data_path, 'rb') as f: for line in tqdm(f, desc='Loading data'): self.indexes.append((offset, len(line))) offset += len(line) else: offset = 0 with open(data_path, 'rb') as f: for line in tqdm(f, desc='Loading data'): self.indexes.append(((offset, 
len(line)), f_index)) offset += len(line) if self.split == 'train': random.shuffle(self.indexes) return @staticmethod def base64_to_image(base64_str): byte_data = base64.b64decode(base64_str) image_data = BytesIO(byte_data) img = Image.open(image_data) if img.mode != 'RGB': img = img.convert('RGB') return img def get_line_for_line(self, filename): while True: for fname in filename: if fname[-2:] != "gz": file = open(fname) for line in file: if line != "": data = line.strip().split('\t') image_base64 = data[4] image_item = self.base64_to_image(image_base64) if min(image_item.size) >= self.image_size: yield line else: file = gzip.GzipFile(fname, "r") for line in file: if line != "": line = line.decode() data = line.strip().split('\t') image_base64 = data[4] image_item = self.base64_to_image(image_base64) if min(image_item.size) >= self.image_size: yield line def __getitem__(self, index): if not isinstance(self.filename, list): data = self.for_line.__next__() else: data = self.for_line.__next__() data = data.strip().split('\t') # For laion 400m if len(data) == 6: image_base64 = data[4] caption = data[2] image_item = self.base64_to_image(image_base64) # Filter image resolution if min(image_item.size) < self.filter_image_resolution: return None if not self.sr: self.transform = T.Compose([ T.CenterCrop([min(image_item.size), min(image_item.size)]), T.Resize(64), T.ToTensor() ]) image_item = self.transform(image_item) else: img = np.array(image_item).astype(np.uint8) crop = min(img.shape[0], img.shape[1]) h, w, = img.shape[0], img.shape[1] if img.shape[0] > img.shape[1]: img = img[0:crop, (w - crop) // 2:(w + crop) // 2] else: img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:( w + crop) // 2] image = Image.fromarray(img) image = image.resize( (self.image_size, self.image_size), resample=self.interpolation) image_item = self.transform(image) example = {'id': index, 'image': image_item, 'caption': caption} return example def __len__(self): #return len(self.indexes) if self.sr: return 300000000 return 5000000 ================================================ FILE: ppfleetx/data/dataset/vision_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
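# (Added note, not part of the original header) GeneralClsDataset below reads
# `cls_label_path` as a plain-text annotation file with one sample per line,
# "<image path relative to image_root><delimiter><label>", e.g. with the
# default " " delimiter (paths are illustrative only):
#   train/cat/0001.jpg 0
#   train/dog/0002.jpg 1
# When multi_label=True, the label field is a comma-separated list of class
# ids, e.g. "train/0003.jpg 2,5", which is expanded into a one-hot vector.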
import os import os.path import copy import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple import paddle from ppfleetx.utils.log import logger from ppfleetx.data.transforms.utils import create_preprocess_operators, transform __all__ = [ "GeneralClsDataset", "ImageFolder", "CIFAR10", "ContrativeLearningDataset", ] class GeneralClsDataset(paddle.io.Dataset): def __init__(self, image_root, cls_label_path, transform_ops=None, delimiter=" ", multi_label=False, class_num=None): if multi_label: assert class_num is not None, "Must set class_num when multi_label=True" self.multi_label = multi_label self.classes_num = class_num self._img_root = image_root self._cls_path = cls_label_path self.delimiter = delimiter self._transform_ops = None if transform_ops: self._transform_ops = create_preprocess_operators(transform_ops) self.images = [] self.labels = [] self._load_anno() def _load_anno(self): assert os.path.exists( self._cls_path), f"{self._cls_path} does not exists" assert os.path.exists( self._img_root), f"{self._img_root} does not exists" self.images = [] self.labels = [] with open(self._cls_path) as fd: lines = fd.readlines() for l in lines: l = l.strip().split(self.delimiter) self.images.append(os.path.join(self._img_root, l[0])) if self.multi_label: self.labels.append(l[1]) else: self.labels.append(np.int32(l[1])) assert os.path.exists(self.images[ -1]), f"{self.images[-1]} does not exists" def __getitem__(self, idx): try: with open(self.images[idx], 'rb') as f: img = f.read() if self._transform_ops: img = transform(img, self._transform_ops) if self.multi_label: one_hot = np.zeros([self.classes_num], dtype=np.float32) cls_idx = [int(e) for e in self.labels[idx].split(',')] for idx in cls_idx: one_hot[idx] = 1.0 return (img, one_hot) else: return (img, np.int32(self.labels[idx])) except Exception as ex: logger.error("Exception occured when parse line: {} with msg: {}". format(self.images[idx], ex)) rnd_idx = np.random.randint(self.__len__()) return self.__getitem__(rnd_idx) def __len__(self): return len(self.images) @property def class_num(self): if self.multi_label: return self.classes_num return len(set(self.labels)) IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp") class ImageFolder(paddle.io.Dataset): """ Code ref from https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py A generic data loader where the images are arranged in this way by default: :: root/dog/xxx.png root/dog/xxy.png root/dog/[...]/xxz.png root/cat/123.png root/cat/nsdf3.png root/cat/[...]/asd932_.png This class inherits from :class:`~torchvision.datasets.DatasetFolder` so the same methods can be overridden to customize the dataset. Args: root (string): Root directory path. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop`` target_transform (callable, optional): A function/transform that takes in the target and transforms it. loader (callable, optional): A function to load an image given its path. is_valid_file (callable, optional): A function that takes path of an Image file and check if the file is a valid file (used to check of corrupt files) Attributes: classes (list): List of the class names sorted alphabetically. class_to_idx (dict): Dict with items (class_name, class_index). 
imgs (list): List of (image path, class_index) tuples """ def __init__(self, root, extensions=IMG_EXTENSIONS, transform_ops=None): self.root = root classes, class_to_idx = self.find_classes(self.root) samples = self.make_dataset(self.root, class_to_idx, extensions) logger.info( f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions self.classes = classes self.class_to_idx = class_to_idx self.imgs = samples self.targets = [s[1] for s in samples] self._transform_ops = None if transform_ops: self._transform_ops = create_preprocess_operators(transform_ops) @staticmethod def make_dataset( directory, class_to_idx, extensions=None, is_valid_file=None, ): """Generates a list of samples of a form (path_to_sample, class). Args: directory (str): root dataset directory, corresponding to ``self.root``. class_to_idx (Dict[str, int]): Dictionary mapping class name to class index. extensions (optional): A list of allowed extensions. Either extensions or is_valid_file should be passed. Defaults to None. is_valid_file (optional): A function that takes path of a file and checks if the file is a valid file (used to check of corrupt files) both extensions and is_valid_file should not be passed. Defaults to None. Raises: ValueError: In case ``class_to_idx`` is empty. ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. FileNotFoundError: In case no valid file was found for any class. Returns: List[Tuple[str, int]]: samples of a form (path_to_sample, class) """ if class_to_idx is None: # prevent potential bug since make_dataset() would use the class_to_idx logic of the # find_classes() function, instead of using that of the find_classes() method, which # is potentially overridden and thus could have a different logic. raise ValueError("The class_to_idx parameter cannot be None.") directory = os.path.expanduser(directory) both_none = extensions is None and is_valid_file is None both_something = extensions is not None and is_valid_file is not None if both_none or both_something: raise ValueError( "Both extensions and is_valid_file cannot be None or not None at the same time" ) if extensions is not None: def is_valid_file(filename: str) -> bool: return filename.lower().endswith( extensions if isinstance(extensions, str) else tuple(extensions)) is_valid_file = cast(Callable[[str], bool], is_valid_file) instances = [] available_classes = set() for target_class in sorted(class_to_idx.keys()): class_index = class_to_idx[target_class] target_dir = os.path.join(directory, target_class) if not os.path.isdir(target_dir): continue for root, _, fnames in sorted( os.walk( target_dir, followlinks=True)): for fname in sorted(fnames): path = os.path.join(root, fname) if is_valid_file(path): item = path, class_index instances.append(item) if target_class not in available_classes: available_classes.add(target_class) empty_classes = set(class_to_idx.keys()) - available_classes if empty_classes: msg = f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " if extensions is not None: msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}" raise FileNotFoundError(msg) return instances def find_classes(self, directory): """Find the class folders in a dataset structured as follows:: directory/ ├── class_x │ ├── xxx.ext │ ├── xxy.ext │ └── ... │ └── xxz.ext └── class_y ├── 123.ext ├── nsdf3.ext └── ... 
└── asd932_.ext This method can be overridden to only consider a subset of classes, or to adapt to a different dataset directory structure. Args: directory(str): Root directory path, corresponding to ``self.root`` Raises: FileNotFoundError: If ``dir`` has no class folders. Returns: (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index. """ classes = sorted( entry.name for entry in os.scandir(directory) if entry.is_dir()) if not classes: raise FileNotFoundError( f"Couldn't find any class folder in {directory}.") class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} return classes, class_to_idx def __getitem__(self, idx): try: path, target = self.imgs[idx] with open(path, 'rb') as f: img = f.read() if self._transform_ops: img = transform(img, self._transform_ops) return (img, np.int32(target)) except Exception as ex: logger.error("Exception occured when parse line: {} with msg: {}". format(path, ex)) rnd_idx = np.random.randint(self.__len__()) return self.__getitem__(rnd_idx) def __len__(self) -> int: return len(self.imgs) @property def class_num(self): return len(set(self.classes)) class CIFAR10(paddle.io.Dataset): def __init__( self, root, mode='train', transform_ops=None, ): self.root = root self.mode = mode assert self.mode in ['train', 'test'] self._transform_ops = None self.URL = 'https://dataset.bj.bcebos.com/cifar/cifar-10-python.tar.gz' if transform_ops: self._transform_ops = create_preprocess_operators(transform_ops) if not os.path.exists(os.path.join(self.root, f'data_batch_1')): from ppfleetx.utils.download import cached_path from ppfleetx.utils.file import untar zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) untar( zip_path, mode="r:gz", out_dir=os.path.join(self.root, '..'), delete=True) # wait to download dataset if paddle.distributed.get_world_size() > 1: paddle.distributed.barrier() self.images = [] self.labels = [] self._load_anno() def _load_anno(self): def unpickle(file): import pickle with open(file, 'rb') as fo: dict = pickle.load(fo, encoding='bytes') return dict if self.mode == 'train': for idx in range(1, 6): path = os.path.join(self.root, f'data_batch_{idx}') ret = unpickle(path) data = ret[b'data'] labels = ret[b'labels'] for i in range(len(data)): img = data[i].reshape((3, 32, 32)).transpose((1, 2, 0)) self.images.append(img) self.labels.append(labels[i]) else: path = os.path.join(self.root, f'test_batch') ret = unpickle(path) data = ret[b'data'] labels = ret[b'labels'] for i in range(len(data)): img = data[i].reshape((3, 32, 32)).transpose((1, 2, 0)) self.images.append(img) self.labels.append(labels[i]) def __getitem__(self, idx): img = self.images[idx] if self._transform_ops: img = transform(img, self._transform_ops) return (img, np.int32(self.labels[idx])) def __len__(self): return len(self.images) @property def class_num(self): return len(set(self.labels)) class ContrativeLearningDataset(ImageFolder): """ Code ref from https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py A generic data loader where the images are arranged in this way by default: :: root/dog/xxx.png root/dog/xxy.png root/dog/[...]/xxz.png root/cat/123.png root/cat/nsdf3.png root/cat/[...]/asd932_.png """ def __init__(self, root, extensions=IMG_EXTENSIONS, transform_ops=None): super(ContrativeLearningDataset, self).__init__( root, extensions=extensions, transform_ops=transform_ops) # remove unused attr del self.classes del self.class_to_idx del self.targets # only use image path self.imgs 
= [s[0] for s in self.imgs] def __getitem__(self, idx): try: path = self.imgs[idx] with open(path, 'rb') as f: img = f.read() if self._transform_ops: img1 = transform(img, self._transform_ops) img2 = transform(img, self._transform_ops) return img1, img2 except Exception as ex: logger.error("Exception occured when parse line: {} with msg: {}". format(path, ex)) rnd_idx = np.random.randint(self.__len__()) return self.__getitem__(rnd_idx) def __len__(self) -> int: return len(self.imgs) @property def class_num(self): raise NotImplementedError ================================================ FILE: ppfleetx/data/sampler/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .batch_sampler import * from .collate import Stack, Pad, Tuple, Dict ================================================ FILE: ppfleetx/data/sampler/batch_sampler.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function from __future__ import division import os import sys import numpy as np import math import paddle from paddle.io import DistributedBatchSampler from ppfleetx.distributed.apis import env __all__ = ["GPTBatchSampler", "DistributedBatchSampler"] class GPTBatchSampler(paddle.io.BatchSampler): """Sampler that restricts data loading to a subset of the dataset. In such case, each process can pass a DistributedBatchSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Args: dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement or other python object which implemented `__len__` for BatchSampler to get sample number of data source. batch_size(int): sample indice number in a mini-batch indices. num_replicas(int, optional): porcess number in distributed training. If :attr:`num_replicas` is None, :attr:`num_replicas` will be retrieved from :code:`paddle.distributed.ParallenEnv`. Default None. rank(int, optional): the rank of the current process among :attr:`num_replicas` processes. If :attr:`rank` is None, :attr:`rank` is retrieved from :code:`paddle.distributed.ParallenEnv`. Default None. shuffle(bool): whther to shuffle indices order before genrating batch indices. Default False. 
drop_last(bool): whether drop the last incomplete batch dataset size is not divisible by the batch size. Default False Examples: .. code-block:: python import numpy as np from paddle.io import Dataset, DistributedBatchSampler # init with dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label def __len__(self): return self.num_samples dataset = RandomDataset(100) sampler = DistributedBatchSampler(dataset, batch_size=64) for data in sampler: # do something break """ def __init__(self, dataset, batch_size, num_replicas=None, rank=None, shuffle=False, drop_last=False, consumed_samples=0): self.dataset = dataset assert isinstance(batch_size, int) and batch_size > 0, \ "batch_size should be a positive integer" self.batch_size = batch_size assert isinstance(shuffle, bool), \ "shuffle should be a boolean value" self.shuffle = shuffle assert isinstance(drop_last, bool), \ "drop_last should be a boolean number" from paddle.distributed import ParallelEnv if num_replicas is not None: assert isinstance(num_replicas, int) and num_replicas > 0, \ "num_replicas should be a positive integer" self.nranks = num_replicas else: self.nranks = env.get_data_world_size() if rank is not None: assert isinstance(rank, int) and rank >= 0, \ "rank should be a non-negative integer" self.local_rank = rank else: self.local_rank = env.get_data_world_rank() self.drop_last = drop_last self.epoch = 0 self.consumed_samples = consumed_samples self.num_samples = int( math.ceil(len(self.dataset) * 1.0 / self.nranks)) self.total_size = self.num_samples * self.nranks def get_start_end_idx(self): start_idx = self.local_rank * self.batch_size end_idx = start_idx + self.batch_size return start_idx, end_idx def __iter__(self): assert self.consumed_samples % self.nranks == 0, \ "The consumed_samples should be divided by nranks. consumed_samples=%d, nranks=%s" % ( self.consumed_samples, self.nranks) self.remain_num_samples = int( math.ceil((len(self.dataset) - self.consumed_samples) * 1.0 / self.nranks)) self.remain_total_size = self.remain_num_samples * self.nranks self.batch_size_times_rank_size = self.batch_size * self.nranks num_samples = len(self.dataset) batch_indices = [] for idx in range(self.consumed_samples, self.total_size): if idx >= num_samples: batch_indices.append(idx - num_samples) else: batch_indices.append(idx) if len(batch_indices) == self.batch_size_times_rank_size: start_idx, end_idx = self.get_start_end_idx() yield batch_indices[start_idx:end_idx] batch_indices = [] if not self.drop_last and len(batch_indices) > 0: yield batch_indices def __len__(self): num_samples = self.num_samples num_samples += int(not self.drop_last) * (self.batch_size - 1) return num_samples // self.batch_size def set_epoch(self, epoch=0, consumed_samples=0): """ Sets the epoch number. When :attr:`shuffle=True`, this number is used as seeds of random numbers. By default, users may not set this, all replicas (workers) use a different random ordering for each epoch. If set same number at each epoch, this sampler will yield the same ordering at all epoches. Arguments: epoch (int): Epoch number. Examples: .. 
code-block:: python from paddle.io import Dataset, DistributedBatchSampler # init with dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label def __len__(self): return self.num_samples dataset = RandomDataset(100) sampler = DistributedBatchSampler(dataset, batch_size=64) for epoch in range(10): sampler.set_epoch(epoch) """ self.epoch = epoch # if we reset the epoch, the consumed_samples should be set to 0. self.consumed_samples = consumed_samples ================================================ FILE: ppfleetx/data/sampler/collate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle __all__ = [ 'Stack', 'Pad', 'Tuple', 'Dict', ] class Stack(object): """ Stacks the input data samples to construct the batch. The N input samples must have the same shape/length and will be stacked to construct a batch. Args: axis (int, optional): The axis in the result data along which the input data are stacked. Default: 0. dtype (str|numpy.dtype, optional): The value type of the output. If it is set to None, the type of input data is used. Default: None. """ def __init__(self, axis=0, dtype=None): self._axis = axis self._dtype = dtype def __call__(self, data): """ Batchifies the input data by stacking. Args: data (list[numpy.ndarray]): The input data samples. It is a list. Each element is a numpy.ndarray or list. Returns: numpy.ndarray: Stacked batch data. Example: .. code-block:: python from paddlenlp.data import Stack a = [1, 2, 3, 4] b = [3, 4, 5, 6] c = [5, 6, 7, 8] result = Stack()([a, b, c]) ''' [[1, 2, 3, 4], [3, 4, 5, 6], [5, 6, 7, 8]] ''' """ data = np.stack( data, axis=self._axis).astype(self._dtype) if self._dtype else np.stack( data, axis=self._axis) return data class Pad(object): """ Pads the input data samples to the largest length at `axis`. Args: pad_val (float|int, optional): The padding value. Default: 0. axis (int, optional): The axis to pad the arrays. The arrays will be padded to the largest length at `axis`. For example, assume the input arrays have shape (10, 8, 5), (6, 8, 5), (3, 8, 5) and the axis is 0. Each input will be padded into (10, 8, 5) and then stacked to form the final output, which has shape (3, 10, 8, 5). Default: 0. ret_length (bool|numpy.dtype, optional): If it is bool, indicate whether to return the valid length in the output, and the data type of returned length is int32 if True. If it is numpy.dtype, indicate the data type of returned length. Default: None. dtype (numpy.dtype, optional): The value type of the output. If it is set to None, the input data type is used. Default: None. pad_right (bool, optional): Whether the padding direction is right-side. 
If True, it indicates we pad to the right side, while False indicates we pad to the left side. Default: True. """ def __init__(self, pad_val=0, axis=0, ret_length=None, dtype=None, pad_right=True): self._pad_val = pad_val self._axis = axis self._ret_length = ret_length self._dtype = dtype self._pad_right = pad_right def __call__(self, data): """ Batchifies the input data by padding. The input will be padded to the largest dimension at `axis` and then stacked to form the final output. In addition, the function will output the original dimensions at the `axis` if `ret_length` is not None or False. Args: data (list[numpy.ndarray|list]): The input data samples. It is a list. Each element is a numpy.ndarray or list. Returns: numpy.ndarray|tuple[numpy.ndarray]: If `ret_length` is False, it is a numpy.ndarray representing the padded batch data and the shape is (N, …). Otherwise, it is a tuple, besides the padded batch data, the tuple also includes a numpy.ndarray representing original length at `axis` of all input samples, which shaped `(N,)`. Example: .. code-block:: python from paddlenlp.data import Pad a = [1, 2, 3, 4] b = [5, 6, 7] c = [8, 9] result = Pad(pad_val=0)([a, b, c]) ''' [[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]] ''' """ # return data itself for rare unexpected cases when 1-D array is passed to Pad if not isinstance(data[0], list) and not isinstance(data[0], np.ndarray): return np.asarray( data, dtype=self._dtype if self._dtype is not None else np.int64) arrs = [np.asarray(ele) for ele in data] original_length = [ele.shape[self._axis] for ele in arrs] max_size = max(original_length) ret_shape = list(arrs[0].shape) ret_shape[self._axis] = max_size ret_shape = (len(arrs), ) + tuple(ret_shape) ret = np.full( shape=ret_shape, fill_value=self._pad_val, dtype=arrs[0].dtype if self._dtype is None else self._dtype) for i, arr in enumerate(arrs): if arr.shape[self._axis] == max_size: ret[i] = arr else: slices = [slice(None) for _ in range(arr.ndim)] if self._pad_right: slices[self._axis] = slice(0, arr.shape[self._axis]) else: slices[self._axis] = slice( max_size - arr.shape[self._axis], max_size) if slices[self._axis].start != slices[self._axis].stop: slices = [slice(i, i + 1)] + slices ret[tuple(slices)] = arr if self._ret_length: return ret, np.asarray( original_length, dtype="int32") if self._ret_length == True else np.asarray( original_length, self._ret_length) else: return ret class Tuple(object): """ Wraps multiple batchify functions together. The input functions will be applied to the corresponding input fields. Each sample should be a list or tuple containing multiple fields. The i'th batchify function stored in Tuple will be applied on the i'th field. For example, when data sample is (nd_data, label), you can wrap two batchify functions using `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and label correspondingly. Args: fn (callable|list[callable]|tuple[callable]): The batchify functions to wrap. It is a callable function or a list/tuple of callable functions. args (tuple[callable]): The additional batchify functions to wrap. """ def __init__(self, fn, *args): if isinstance(fn, (list, tuple)): assert len(args) == 0, 'Input pattern not understood. The input of Tuple can be ' \ 'Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). ' \ 'Received fn=%s, args=%s' % (str(fn), str(args)) self._fn = fn else: self._fn = (fn, ) + args for i, ele_fn in enumerate(self._fn): assert callable( ele_fn ), 'Batchify functions must be callable! 
type(fn[%d]) = %s' % ( i, str(type(ele_fn))) def __call__(self, data): """ Batchifies data samples by applying each function on the corresponding data field, and each data field is produced by stacking the field data of samples. Args: data (list|tuple): The samples to batchfy. Each sample in list/tuple should contain `N` fields. Returns: tuple: A tuple composed of results from all including batchifying functions. Example: .. code-block:: python from paddlenlp.data import Stack, Pad, Tuple data = [ [[1, 2, 3, 4], [1]], [[5, 6, 7], [0]], [[8, 9], [1]], ] batchify_fn = Tuple(Pad(pad_val=0), Stack()) ids, label = batchify_fn(data) ''' ids: [[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]] label: [[1], [0], [1]] ''' """ assert len(data[0]) == len(self._fn),\ 'The number of attributes in each data sample should contain' \ ' {} elements'.format(len(self._fn)) ret = [] for i, ele_fn in enumerate(self._fn): result = ele_fn([ele[i] for ele in data]) if isinstance(result, (tuple, list)): ret.extend(result) else: ret.append(result) return tuple(ret) class Dict(object): """ Wraps multiple batchify functions together. The input functions will be applied to the corresponding input fields. Each sample should be a dict containing multiple fields. Each batchify function with key stored in `Dict` will be applied on the field which has the same key. For example, when data sample is {'tokens': tokens, 'labels': labels}, you can wrap two batchify functions using `Dict({'tokens': DataBatchify, 'labels': LabelBatchify})` to batchify tokens and labels correspondingly. Args: fn (dict): The batchify functions to wrap. It is a dict, which values is callable functions. """ def __init__(self, fn): assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \ 'Received fn=%s' % (str(fn)) self._fn = fn for col_name, ele_fn in self._fn.items(): assert callable( ele_fn ), 'Batchify functions must be callable! type(fn[%d]) = %s' % ( col_name, str(type(ele_fn))) def __call__(self, data): """ Batchifies data samples by applying each function on the corresponding data field, and each data field is produced by stacking the field data with the same key as batchify functions of all samples. Args: data (list[dict]|tuple[dict]): The samples to batchfy. Each sample in list/tuple is a dict with `N` key-values. Returns: tuple: A tuple composed of results from all including batchifying functions. Example: .. code-block:: python from paddlenlp.data import Stack, Pad, Dict data = [ {'labels':[1], 'token_ids':[1, 2, 3, 4]}, {'labels':[0], 'token_ids':[5, 6, 7]}, {'labels':[1], 'token_ids':[8, 9]}, ] batchify_fn = Dict({'token_ids':Pad(pad_val=0), 'labels':Stack()}) ids, label = batchify_fn(data) ''' ids: [[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]] label: [[1], [0], [1]] ''' """ ret = [] for col_name, ele_fn in self._fn.items(): result = ele_fn([ele[col_name] for ele in data]) if isinstance(result, (tuple, list)): ret.extend(result) else: ret.append(result) return tuple(ret) ================================================ FILE: ppfleetx/data/tokenizers/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .gpt_tokenizer import GPTTokenizer, GPTChineseTokenizer from .ernie_tokenizer import get_ernie_tokenizer from .t5_tokenizer import get_t5_tokenizer from .debertav2_tokenizer import get_debertav2_tokenizer ================================================ FILE: ppfleetx/data/tokenizers/debertav2_tokenizer.py ================================================ # coding=utf-8 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization for DebertaV2.""" from __future__ import (absolute_import, division, print_function, unicode_literals) import os import json import copy import logging import warnings import regex as re import unicodedata import sentencepiece as sp from collections import OrderedDict, UserDict from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union from ppfleetx.utils.download import cached_path from ppfleetx.data.tokenizers.tokenization_utils_base import ( _LazyConfigMapping, AddedToken, TruncationStrategy, PaddingStrategy, BatchEncoding, SpecialTokensMixin) logger = logging.getLogger(__name__) MAX_LENGTH = 256 DEFAULT_DebertaV2_NAME = "projects/imagen/cache/deberta-v-xxlarge" # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file FULL_TOKENIZER_FILE = "tokenizer.json" _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json") CONFIG_NAME = "config.json" def get_debertav2_tokenizer(name): tokenizer = DebertaV2Tokenizer.from_pretrained(name) return tokenizer def debertav2_tokenize(texts, tokenizer): encoded = tokenizer.batch_encode_plus( texts, return_tensors="paddle", padding='longest', max_length=MAX_LENGTH, truncation=True) input_ids = encoded.input_ids attn_mask = encoded.attention_mask return input_ids, attn_mask PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model", "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model", "microsoft/deberta-v2-xlarge-mnli": ("https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model" ), "microsoft/deberta-v2-xxlarge-mnli": ("https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model" ), } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 
"microsoft/deberta-v2-xlarge": 512, "microsoft/deberta-v2-xxlarge": 512, "microsoft/deberta-v2-xlarge-mnli": 512, "microsoft/deberta-v2-xxlarge-mnli": 512, } PRETRAINED_INIT_CONFIGURATION = { "microsoft/deberta-v2-xlarge": { "do_lower_case": False }, "microsoft/deberta-v2-xxlarge": { "do_lower_case": False }, "microsoft/deberta-v2-xlarge-mnli": { "do_lower_case": False }, "microsoft/deberta-v2-xxlarge-mnli": { "do_lower_case": False }, } VOCAB_FILES_NAMES = {"vocab_file": "spm.model"} class DebertaV2Tokenizer(SpecialTokensMixin): r""" Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). Args: vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (`bool`, *optional*, defaults to `False`): Whether or not to lowercase the input when tokenizing. bos_token (`string`, *optional*, defaults to `"[CLS]"`): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the `cls_token`. eos_token (`string`, *optional*, defaults to `"[SEP]"`): The end of sequence token. When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the `sep_token`. unk_token (`str`, *optional*, defaults to `"[UNK]"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (`str`, *optional*, defaults to `"[SEP]"`): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (`str`, *optional*, defaults to `"[PAD]"`): The token used for padding, for example when batching sequences of different lengths. cls_token (`str`, *optional*, defaults to `"[CLS]"`): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (`str`, *optional*, defaults to `"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set: - `enable_sampling`: Enable subword regularization. - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - `nbest_size = {0,1}`: No sampling is performed. - `nbest_size > 1`: samples from the nbest_size results. - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) using forward-filtering-and-backward-sampling algorithm. - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "token_type_ids", "attention_mask"] padding_side = "right" truncation_side = "right" slow_tokenizer_class = None def __init__(self, vocab_file, do_lower_case=False, split_by_punct=False, bos_token="[CLS]", eos_token="[SEP]", unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs=None, **kwargs): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.added_tokens_encoder: Dict[str, int] = {} self.added_tokens_decoder: Dict[int, str] = {} super().__init__( do_lower_case=do_lower_case, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" ) self.do_lower_case = do_lower_case self.split_by_punct = split_by_punct self.vocab_file = vocab_file self._tokenizer = SPMTokenizer( vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs) def __len__(self): """ Size of the full vocabulary with the added tokens. """ return self.vocab_size + len(self.added_tokens_encoder) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) commit_hash = kwargs.pop("_commit_hash", None) _raise_exceptions_for_missing_entries = False user_agent = { "file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__ } if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} is_local = os.path.isdir(pretrained_model_name_or_path) single_file_id = None if os.path.isfile( pretrained_model_name_or_path ): # or is_remote_url(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " "supported for this tokenizer. Use a model identifier or the path to a directory instead." ) warnings.warn( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " "won't be possible anymore in v5. 
Use a model identifier or the path to a directory instead.", FutureWarning, ) file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path single_file_id = file_id else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } vocab_files = { ** cls.vocab_files_names, ** additional_files_names } if "tokenizer_file" in vocab_files: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE resolved_config_file = cached_file( pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, proxies=proxies, use_auth_token=use_auth_token, revision=revision, local_files_only=local_files_only, subfolder=subfolder, user_agent=user_agent, _raise_exceptions_for_missing_entries=False, _raise_exceptions_for_connection_errors=False, _commit_hash=commit_hash, ) commit_hash = extract_commit_hash(resolved_config_file, commit_hash) if resolved_config_file is not None: with open( resolved_config_file, encoding="utf-8") as reader: tokenizer_config = json.load(reader) if "fast_tokenizer_files" in tokenizer_config: fast_tokenizer_file = get_fast_tokenizer_file( tokenizer_config["fast_tokenizer_files"]) vocab_files["tokenizer_file"] = fast_tokenizer_file # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} unresolved_files = [] for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None elif single_file_id == file_id: if os.path.isfile(file_path): resolved_vocab_files[file_id] = file_path elif is_remote_url(file_path): resolved_vocab_files[file_id] = download_url( file_path, proxies=proxies) else: if subfolder is None: subfolder = "" path_or_repo_id = str(pretrained_model_name_or_path) if os.path.isdir(path_or_repo_id): resolved_file = os.path.join( os.path.join(path_or_repo_id, subfolder), file_path) if not os.path.isfile(resolved_file): if _raise_exceptions_for_missing_entries: raise EnvironmentError( f"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout " f"'https://huggingface.co/{path_or_repo_id}/{revision}' for available files." ) else: resolved_file = None resolved_vocab_files[file_id] = resolved_file else: resolved_vocab_files[file_id] = cached_path( file_path, cache_dir=cache_dir, ) if len(unresolved_files) > 0: logger.info( f"Can't load following files from cache: {unresolved_files} and cannot check if these " "files are necessary for the tokenizer to operate.") if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): raise EnvironmentError( f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing all relevant files for a {cls.__name__} tokenizer." 
) for file_id, file_path in vocab_files.items(): if file_id not in resolved_vocab_files: continue #if is_local: # logger.info(f"loading file {file_path}") #else: # logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") return cls._from_pretrained( resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=use_auth_token, cache_dir=cache_dir, local_files_only=local_files_only, _commit_hash=commit_hash, **kwargs, ) @classmethod def _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=None, cache_dir=None, local_files_only=False, _commit_hash=None, **kwargs): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json # file or if `from_slow` is set to True. from_slow = kwargs.get("from_slow", False) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None if from_slow: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( copy.deepcopy(resolved_vocab_files), pretrained_model_name_or_path, copy.deepcopy(init_configuration), *init_inputs, use_auth_token=use_auth_token, cache_dir=cache_dir, local_files_only=local_files_only, _commit_hash=_commit_hash, **(copy.deepcopy(kwargs)), ) else: slow_tokenizer = None # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop( "tokenizer_config_file", None) if tokenizer_config_file is not None: with open( tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. config_tokenizer_class = init_kwargs.get("tokenizer_class") init_kwargs.pop("tokenizer_class", None) init_kwargs.pop("auto_map", None) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: config_tokenizer_class = None init_kwargs = init_configuration if config_tokenizer_class is None: try: config_dict = resolved_vocab_files.pop("config_file", CONFIG_NAME) config_dict = os.path.join(pretrained_model_name_or_path, config_dict) config_dict = cls._dict_from_json_file(config_dict) config_tokenizer_class = config_dict[ "tokenizer_class"] if "tokenizer_class" in config_dict else None except (OSError, ValueError, KeyError): # skip if an error occurred. config_dict = None if config_tokenizer_class is None: # Third attempt. If we have not yet found the original type of the tokenizer, # we are loading we see if we can infer it from the type of the configuration file from ppfleetx.data.tokenizers.tokenization_utils_base import TOKENIZER_MAPPING_NAMES # tests_ignore model_type = config_dict[ "model_type"] if "model_type" in config_dict else None if model_type is None: # Fallback: use pattern matching on the string. 
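                    # The fallback below scans TOKENIZER_MAPPING_NAMES and takes the first key that
                    # appears as a substring of the checkpoint path/identifier; the tokenizer class
                    # registered for that model type is then used for the class-mismatch check.
                    # If nothing matches, model_type stays None and the mismatch warning is skipped.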
for pattern in TOKENIZER_MAPPING_NAMES.keys(): if pattern in str(pretrained_model_name_or_path): model_type = pattern break if model_type is not None: config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( model_type, (None, None)) if config_tokenizer_class is None: config_tokenizer_class = config_tokenizer_class_fast if config_tokenizer_class is not None: if cls.__name__.replace( "Fast", "") != config_tokenizer_class.replace("Fast", ""): logger.warning( "The tokenizer class you load from this checkpoint is not the same type as the class this" " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you" f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called" f" from is '{cls.__name__}'.") # Update with newly provided kwargs init_kwargs.update(kwargs) # Convert AddedTokens serialized as dict to class instances def convert_added_tokens(obj: Union[AddedToken, Any]): if isinstance(obj, dict) and "__type" in obj and obj[ "__type"] == "AddedToken": obj.pop("__type") return AddedToken(**obj) elif isinstance(obj, (list, tuple)): return list(convert_added_tokens(o) for o in obj) elif isinstance(obj, dict): return {k: convert_added_tokens(v) for k, v in obj.items()} return obj init_kwargs = convert_added_tokens(init_kwargs) # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings model_max_length = cls.max_model_input_sizes[ pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): model_max_length = min( init_kwargs.get("model_max_length", int(1e30)), model_max_length) # TODO(PVP) - uncomment following line in Transformers v5 # init_kwargs["model_max_length"] = model_max_length # TODO(PVP) - remove in Transformers v5 # --- init_kwargs[ "model_max_length"] = cls._eventually_correct_t5_max_length( pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")) # --- # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer init_kwargs["name_or_path"] = pretrained_model_name_or_path # Instantiate tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: raise OSError( "Unable to load vocabulary from file. " "Please check that the provided vocabulary is accessible and not corrupted." 
) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` # Removed: Now done at the base class level # tokenizer.init_inputs = init_inputs # tokenizer.init_kwargs = init_kwargs # If there is a complementary special token map, load it special_tokens_map_file = resolved_vocab_files.pop( "special_tokens_map_file", None) if special_tokens_map_file is not None: with open( special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: special_tokens_map = json.load(special_tokens_map_handle) for key, value in special_tokens_map.items(): if key in kwargs and kwargs[key]: # This value has already been redefined by the kwargs # We keep this new value and ignore the one stored in the special_tokens_map_file continue if isinstance(value, dict): value = AddedToken(**value) elif isinstance(value, list): value = [ AddedToken(**token) if isinstance(token, dict) else token for token in value ] setattr(tokenizer, key, value) # Add supplementary tokens. special_tokens = tokenizer.all_special_tokens if added_tokens_file is not None: with open( added_tokens_file, encoding="utf-8") as added_tokens_handle: added_tok_encoder = json.load(added_tokens_handle) # Sort added tokens by index added_tok_encoder_sorted = list( sorted( added_tok_encoder.items(), key=lambda x: x[1])) # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for # individual tokens would repeatedly rebuild a trie, which can be slow. is_last_special = None tokens = [] for token, index in added_tok_encoder_sorted: current_index = len(tokenizer) + len(tokens) if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids( token) != index: # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the # index is the current length of the tokenizer (not in vocabulary) raise ValueError( f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " f"{index}.") elif not has_tokenizer_file and index != current_index: # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the # current length of the tokenizer. raise ValueError( f"Non-consecutive added token '{token}' found. " f"Should have index {current_index} but has index {index} in saved vocabulary." ) is_special = bool(token in special_tokens) if is_last_special is None or is_last_special == is_special: tokens.append(token) else: tokenizer.add_tokens( tokens, special_tokens=is_last_special) tokens = [token] is_last_special = is_special if tokens: tokenizer.add_tokens(tokens, special_tokens=is_last_special) # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab added_tokens = tokenizer.sanitize_special_tokens() #if added_tokens: # logger.warning_advice( # "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" # " fine-tuned or trained." 
# ) return tokenizer @property def vocab_size(self): return len(self.vocab) @property def vocab(self): return self._tokenizer.vocab def get_vocab(self): vocab = self.vocab.copy() vocab.update(self.get_added_vocab()) return vocab @classmethod def _dict_from_json_file(cls, json_file): with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() return json.loads(text) def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" if self.do_lower_case: text = text.lower() return self._tokenizer.tokenize(text) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self._tokenizer.spm.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self._tokenizer.spm.IdToPiece( index) if index < self.vocab_size else self.unk_token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" return self._tokenizer.decode(tokens) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A DeBERTa sequence has the following format: - single sequence: [CLS] X [SEP] - pair of sequences: [CLS] A [SEP] B [SEP] Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ( [0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa sequence pair mask has the following format: ``` 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | ``` If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
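        Example (a minimal sketch; the token IDs are placeholders):

        ```python
        tokenizer.create_token_type_ids_from_sequences([10, 11], [20, 21])
        # [CLS] 10 11 [SEP] 20 21 [SEP]  ->  [0, 0, 0, 0, 1, 1, 1]
        ```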
""" sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): add_prefix_space = kwargs.pop("add_prefix_space", False) if is_split_into_words or add_prefix_space: text = " " + text return (text, kwargs) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]: return self._tokenizer.save_pretrained( save_directory, filename_prefix=filename_prefix) def _eventual_warn_about_too_long_sequence(self, ids, max_length, verbose: bool): """ Depending on the input and internal state we might trigger a warning about a sequence that is too long for its corresponding model Args: ids (`List[str]`): The ids produced by the tokenization max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set) verbose (`bool`): Whether or not to print more information and warnings. """ if max_length is None and len(ids) > self.model_max_length and verbose: if not self.deprecation_warnings.get( "sequence-length-is-longer-than-the-specified-maximum", False): logger.warning( "Token indices sequence length is longer than the specified maximum sequence length " f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " "will result in indexing errors") self.deprecation_warnings[ "sequence-length-is-longer-than-the-specified-maximum"] = True def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 
512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." ) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. 
" "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." ) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def _pad(self, encoded_inputs, max_length=None, padding_strategy=PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of=None, return_attention_mask=None): """ Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Args: encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). max_length: maximum length of the returned list and optionally padding length (see below). Will truncate by taking into account the special tokens. padding_strategy: PaddingStrategy to use for padding. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad The tokenizer padding sides are defined in self.padding_side: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names required_input = encoded_inputs[self.model_input_names[0]] if padding_strategy == PaddingStrategy.LONGEST: max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( required_input) != max_length # Initialize attention mask if not present. 
if return_attention_mask and "attention_mask" not in encoded_inputs: encoded_inputs["attention_mask"] = [1] * len(required_input) if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs[ "attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs[ "special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [ 0 ] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = [ self.pad_token_type_id ] * difference + encoded_inputs["token_type_ids"] if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = [ 1 ] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError("Invalid padding strategy:" + str( self.padding_side)) return encoded_inputs def pad( self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, ): """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`) If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of PyTorch tensors, you will lose the specific device of your tensors however. Args: encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask) return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return Numpy `np.ndarray` objects. verbose (`bool`, *optional*, defaults to `True`): Whether or not to print more information and warnings. """ # If we have a list of dicts, let's convert it in a dict of lists # We do this to allow using this method as a collate_fn function in PyTorch Dataloader if isinstance(encoded_inputs, (list, tuple)) and isinstance( encoded_inputs[0], Mapping): encoded_inputs = { key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys() } # The model's main input name, usually `input_ids`, has be passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" ) required_input = encoded_inputs[self.model_input_names[0]] if not required_input: if return_attention_mask: encoded_inputs["attention_mask"] = [] return encoded_inputs # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects # and rebuild them afterwards if no return_tensors is specified # Note that we lose the specific device the tensor may be on for PyTorch first_element = required_input[0] if isinstance(first_element, (list, tuple)): # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. for item in required_input: if len(item) != 0: first_element = item[0] break # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. if not isinstance(first_element, (int, list, tuple)): if is_tf_available() and _is_tensorflow(first_element): return_tensors = "tf" if return_tensors is None else return_tensors elif is_torch_available() and _is_torch(first_element): return_tensors = "pt" if return_tensors is None else return_tensors elif isinstance(first_element, np.ndarray): return_tensors = "np" if return_tensors is None else return_tensors else: raise ValueError( f"type of {first_element} unknown: {type(first_element)}. " "Should be one of a python, numpy, pytorch or tensorflow object." 
) for key, value in encoded_inputs.items(): encoded_inputs[key] = to_py_obj(value) # Convert padding_strategy in PaddingStrategy padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( padding=padding, max_length=max_length, verbose=verbose) required_input = encoded_inputs[self.model_input_names[0]] if required_input and not isinstance(required_input[0], (list, tuple)): encoded_inputs = self._pad( encoded_inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) batch_size = len(required_input) assert all( len(v) == batch_size for v in encoded_inputs.values() ), "Some items in the output dictionary have a different batch size than others." if padding_strategy == PaddingStrategy.LONGEST: max_length = max(len(inputs) for inputs in required_input) padding_strategy = PaddingStrategy.MAX_LENGTH batch_outputs = {} for i in range(batch_size): inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) outputs = self._pad( inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) return BatchEncoding(batch_outputs, tensor_type=return_tensors) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of zeros. """ eos = [self.eos_token_id] if token_ids_1 is None: return len(token_ids_0 + eos) * [0] return len(token_ids_0 + eos + token_ids_1 + eos) * [0] def _add_eos_if_not_present(self, token_ids): """Do not add eos again if user already added it.""" if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: warnings.warn( f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" " eos tokens being added.") return token_ids else: return token_ids + [self.eos_token_id] def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0): """ Truncates a sequence pair in-place following the strategy. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (`int`, *optional*, defaults to 0): Number of tokens to remove using the truncation strategy. truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): The strategy to follow for truncation. Can be: - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided. 
- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater than the model maximum admissible input size). stride (`int`, *optional*, defaults to 0): If set to a positive number, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. Returns: `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair of sequences (or a batch of pairs) is provided. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if not isinstance(truncation_strategy, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation_strategy) overflowing_tokens = [] if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) if self.truncation_side == "left": overflowing_tokens = ids[:window_len] ids = ids[num_tokens_to_remove:] elif self.truncation_side == "right": overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: raise ValueError( f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'." ) else: error_msg = ( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. ") if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: logger.warning( "Be aware, overflowing tokens are not returned for the setting you have chosen," f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " "truncation strategy. 
So the returned list will always be empty even if some " "tokens have been removed.") for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): if self.truncation_side == "right": ids = ids[:-1] elif self.truncation_side == "left": ids = ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: if self.truncation_side == "right": pair_ids = pair_ids[:-1] elif self.truncation_side == "left": pair_ids = pair_ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) if self.truncation_side == "right": overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif self.truncation_side == "left": overflowing_tokens = pair_ids[:window_len] pair_ids = pair_ids[num_tokens_to_remove:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: logger.error( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the second sequence has a length {len(pair_ids)}. " f"Please select another truncation strategy than {truncation_strategy}, " "for instance 'longest_first' or 'only_first'.") return (ids, pair_ids, overflowing_tokens) def num_special_tokens_to_add(self, pair: bool=False) -> int: """ Returns the number of added tokens when encoding a sequence with special tokens. This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair (`bool`, *optional*, defaults to `False`): Whether the number of added tokens should be computed in the case of a sequence pair or a single sequence. Returns: `int`: Number of special tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def prepare_for_model(self, ids, pair_ids=None, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, prepend_batch_axis=False, **kwargs): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an error. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. 
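# Hypothetical sketch (not used by this class) of the 'longest_first' loop in
# `truncate_sequences` above: with truncation_side == "right", one token at a
# time is dropped from whichever sequence is currently longer.
def _longest_first_sketch(ids, pair_ids, num_tokens_to_remove):
    ids, pair_ids = list(ids), list(pair_ids)
    for _ in range(num_tokens_to_remove):
        if len(ids) > len(pair_ids):
            ids = ids[:-1]
        else:
            pair_ids = pair_ids[:-1]
    return ids, pair_ids

# _longest_first_sketch([1, 2, 3, 4, 5], [6, 7], 3) -> ([1, 2], [6, 7])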
""" # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 if return_token_type_ids and not add_special_tokens: raise ValueError( "Asking to return token_type_ids while setting add_special_tokens to False " "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None.") if (return_overflowing_tokens and truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is not None): raise ValueError( "Not possible to return overflowing tokens for pair of sequences with the " "`longest_first`. Please select another truncation strategy than `longest_first`, " "for instance `only_second` or `only_first`.") # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( pair=pair) if add_special_tokens else 0) # Truncation: Handle max sequence length overflowing_tokens = [] if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, pair_ids=pair_ids, num_tokens_to_remove=total_len - max_length, truncation_strategy=truncation_strategy, stride=stride, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences( ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) # Build output dictionary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids if return_special_tokens_mask: if add_special_tokens: encoded_inputs[ "special_tokens_mask"] = self.get_special_tokens_mask( ids, pair_ids) else: encoded_inputs["special_tokens_mask"] = [0] * len(sequence) # Check lengths self._eventual_warn_about_too_long_sequence( encoded_inputs["input_ids"], max_length, verbose) # Padding if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: encoded_inputs = self.pad( encoded_inputs, max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) if return_length: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) batch_outputs = BatchEncoding( encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis) return batch_outputs def _batch_prepare_for_model( self, batch_ids_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, 
return_length=False, verbose=True, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: batch_ids_pairs: list of tokenized input ids or input ids pairs """ batch_outputs = {} for first_ids, second_ids in batch_ids_pairs: outputs = self.prepare_for_model( first_ids, second_ids, add_special_tokens=add_special_tokens, padding=PaddingStrategy.DO_NOT_PAD. value, # we pad in batch afterward truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=None, # We convert the whole batch to tensors at the end prepend_batch_axis=False, verbose=verbose, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) batch_outputs = self.pad( batch_outputs, padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) batch_outputs = BatchEncoding( batch_outputs, tensor_type=return_tensors) return batch_outputs def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 
512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." ) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. 
" "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." ) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): """ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. This method is deprecated, `__call__` should be used instead. Args: batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`): Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in `encode_plus`). """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) return self._batch_encode_plus( batch_text_or_text_pairs=batch_text_or_text_pairs, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, verbose=verbose, **kwargs, ) def _batch_encode_plus( self, batch_text_or_text_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return 
self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast.") input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if not isinstance(ids_or_pair_ids, (list, tuple)): ids, pair_ids = ids_or_pair_ids, None elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): ids, pair_ids = ids_or_pair_ids, None else: ids, pair_ids = ids_or_pair_ids first_ids = get_input_ids(ids) second_ids = get_input_ids( pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) batch_outputs = self._batch_prepare_for_model( input_ids, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=return_tensors, verbose=verbose, ) return BatchEncoding(batch_outputs) def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens, using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. Args: text (`str`): The sequence to be encoded. **kwargs (additional keyword arguments): Passed along to the model-specific `prepare_for_tokenization` preprocessing method. Returns: `List[str]`: The list of tokens. """ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors all_special_tokens_extended = dict( (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)) text, kwargs = self.prepare_for_tokenization(text, **kwargs) if kwargs: logger.warning(f"Keyword arguments {kwargs} not recognized.") # TODO: should this be in the base class? 
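# Illustrative sketch (hypothetical helper, not the Trie used below) of the idea
# behind `tokens_trie.split`: added/special tokens are kept as atomic chunks and
# only the remaining text is handed to `_tokenize`.
import re

def _split_on_special_sketch(text, special_tokens):
    pattern = "(" + "|".join(re.escape(t) for t in special_tokens) + ")"
    return [chunk for chunk in re.split(pattern, text) if chunk]

# _split_on_special_sketch("hello <|endoftext|> world", ["<|endoftext|>"])
# -> ['hello ', '<|endoftext|>', ' world']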
if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: tok_extended = all_special_tokens_extended.get(token, None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): if tok_extended.rstrip and right: # A bit counter-intuitive but we strip the left of the string # since tok_extended.rstrip means the special token is eating all white spaces on its right tokens[i + 1] = right.lstrip() # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here else: # We strip left and right by default if right: tokens[i + 1] = right.lstrip() if left: tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: # Need to skip eventual empty (fully stripped) tokens if not token: continue if token in no_split_token: tokenized_text.append(token) else: tokenized_text.extend(self._tokenize(token)) # ["This", " is", " something", "", "else"] return tokenized_text class SPMTokenizer: r""" Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece). Args: vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that contains the vocabulary necessary to instantiate a tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set: - `enable_sampling`: Enable subword regularization. - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - `nbest_size = {0,1}`: No sampling is performed. - `nbest_size > 1`: samples from the nbest_size results. - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) using forward-filtering-and-backward-sampling algorithm. - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. 
""" def __init__(self, vocab_file, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]]=None): self.split_by_punct = split_by_punct self.vocab_file = vocab_file self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) if not os.path.exists(vocab_file): raise FileNotFoundError(f"{vocab_file} does not exist!") spm.load(vocab_file) bpe_vocab_size = spm.GetPieceSize() # Token map # 0+1 # 1+1 # 2+1 self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)} self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] # self.vocab['[PAD]'] = 0 # self.vocab['[CLS]'] = 1 # self.vocab['[SEP]'] = 2 # self.vocab['[UNK]'] = 3 self.spm = spm def __getstate__(self): state = self.__dict__.copy() state["spm"] = None return state def __setstate__(self, d): self.__dict__ = d # for backward compatibility if not hasattr(self, "sp_model_kwargs"): self.sp_model_kwargs = {} self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) self.spm.Load(self.vocab_file) def tokenize(self, text): return self._encode_as_pieces(text) def convert_ids_to_tokens(self, ids): tokens = [] for i in ids: tokens.append(self.ids_to_tokens[i]) return tokens def decode(self, tokens, start=-1, end=-1, raw_text=None): if raw_text is None: return self.spm.decode_pieces([t for t in tokens]) else: words = self.split_to_words(raw_text) word_tokens = [self.tokenize(w) for w in words] token2words = [0] * len(tokens) tid = 0 for i, w in enumerate(word_tokens): for k, t in enumerate(w): token2words[tid] = i tid += 1 word_start = token2words[start] word_end = token2words[end] if end < len(tokens) else len(words) text = "".join(words[word_start:word_end]) return text def add_special_token(self, token): if token not in self.special_tokens: self.special_tokens.append(token) if token not in self.vocab: self.vocab[token] = len(self.vocab) - 1 self.ids_to_tokens.append(token) return self.id(token) def part_of_whole_word(self, token, is_bos=False): if is_bos: return True if (len(token) == 1 and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))) or token in self.special_tokens: return False word_start = b"\xe2\x96\x81".decode("utf-8") return not token.startswith(word_start) def pad(self): return "[PAD]" def bos(self): return "[CLS]" def eos(self): return "[SEP]" def unk(self): return "[UNK]" def mask(self): return "[MASK]" def sym(self, id): return self.ids_to_tokens[id] def id(self, sym): return self.vocab[sym] if sym in self.vocab else 1 def _encode_as_pieces(self, text): text = convert_to_unicode(text) if self.split_by_punct: words = self._run_split_on_punc(text) pieces = [self.spm.encode(w, out_type=str) for w in words] return [p for w in pieces for p in w] else: return self.spm.encode(text, out_type=str) def split_to_words(self, text): pieces = self._encode_as_pieces(text) word_start = b"\xe2\x96\x81".decode("utf-8") words = [] offset = 0 prev_end = 0 for i, p in enumerate(pieces): if p.startswith(word_start): if offset > prev_end: words.append(text[prev_end:offset]) prev_end = offset w = p.replace(word_start, "") else: w = p try: s = text.index(w, offset) pn = "" k = i + 1 while k < len(pieces): pn = pieces[k].replace(word_start, "") if len(pn) > 0: break k += 1 if len(pn) > 0 and pn in text[offset:s]: offset = offset + 1 else: offset = s + len(w) except Exception: offset = offset + 1 if prev_end < offset: words.append(text[prev_end:offset]) return words def _run_strip_accents(self, text): """Strips 
accents from a piece of text.""" text = unicodedata.normalize("NFD", text) output = [] for char in text: cat = unicodedata.category(char) if cat == "Mn": continue output.append(char) return "".join(output) def _run_split_on_punc(self, text): """Splits punctuation on a piece of text.""" chars = list(text) i = 0 start_new_word = True output = [] while i < len(chars): char = chars[i] if _is_punctuation(char): output.append([char]) start_new_word = True else: if start_new_word: output.append([]) start_new_word = False output[-1].append(char) i += 1 return ["".join(x) for x in output] def save_pretrained(self, path: str, filename_prefix: str=None): filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]] if filename_prefix is not None: filename = filename_prefix + "-" + filename full_path = os.path.join(path, filename) with open(full_path, "wb") as fs: fs.write(self.spm.serialized_model_proto()) return (full_path, ) def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `chars` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False def convert_to_unicode(text): """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" if isinstance(text, str): return text elif isinstance(text, bytes): return text.decode("utf-8", "ignore") else: raise ValueError(f"Unsupported string type: {type(text)}") ================================================ FILE: ppfleetx/data/tokenizers/ernie_tokenizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddlenlp.transformers import ErnieTokenizer tokenizer = None def get_ernie_tokenizer(tokenizer_type): global tokenizer if tokenizer is None: tokenizer = ErnieTokenizer.from_pretrained(tokenizer_type) return tokenizer ================================================ FILE: ppfleetx/data/tokenizers/gpt_tokenizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" from __future__ import (absolute_import, division, print_function, unicode_literals) import sys import json import logging import warnings import os import regex as re from io import open from ppfleetx.utils.download import cached_path try: from functools import lru_cache except ImportError: # Just a dummy decorator to get the checks to run on python2 # because honestly I don't want to support a byte-level unicode BPE # tokenizer on python 2 right now. def lru_cache(): return lambda func: func from ppfleetx.utils.log import logger try: import paddlenlp from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer except ImportError: raise ImportError("Please import paddlenlp before running the GPT tasks.") PRETRAINED_VOCAB_ARCHIVE_MAP = { 'gpt2': "http://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json", } PRETRAINED_MERGES_ARCHIVE_MAP = { 'gpt2': "http://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {'gpt2': 1024, } VOCAB_NAME = 'vocab.json' MERGES_NAME = 'merges.txt' SPECIAL_TOKENS_NAME = 'special_tokens.txt' @lru_cache() def bytes_to_unicode(): """ Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ _chr = unichr if sys.version_info[0] == 2 else chr bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \ list(range(ord("®"), ord("ÿ") + 1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8 + n) n += 1 cs = [_chr(n) for n in cs] return dict(zip(bs, cs)) def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs class GPTTokenizer(object): """ GPT-2 BPE tokenizer. Peculiarities: - Byte-level BPE """ padding_side = "right" truncation_side = "right" model_input_names = ["input_ids", "token_type_ids", "attention_mask"] pad_token_type_id = 0 pad_token_id = 0 @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ Instantiate a PreTrainedBertModel from a pre-trained model file. Download and cache the pre-trained model file if needed. 
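# Small usage sketch of `bytes_to_unicode` above: every byte of the UTF-8
# encoded text is mapped to a printable unicode character before BPE, so no
# <unk> token is ever needed for raw bytes.
byte_encoder = bytes_to_unicode()
mapped = "".join(byte_encoder[b] for b in "hello world".encode("utf-8"))
# mapped == 'helloĠworld'  (the space byte 0x20 is remapped to 'Ġ')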
""" if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ pretrained_model_name_or_path] merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[ pretrained_model_name_or_path] special_tokens_file = None else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) if not os.path.exists(special_tokens_file): special_tokens_file = None else: logger.info("loading special tokens file {}".format( special_tokens_file)) # redirect to the cache, if necessary try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path( merges_file, cache_dir=cache_dir) except Exception as e: logger.info(e) logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ', '.join( PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) logger.info("loading merges file {}".format(merges_file)) else: logger.info("loading vocabulary file {} from cache at {}".format( vocab_file, resolved_vocab_file)) logger.info("loading merges file {} from cache at {}".format( merges_file, resolved_merges_file)) if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: # if we're using a pretrained model, ensure the tokenizer wont index sequences longer # than the number of positional embeddings max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ pretrained_model_name_or_path] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) # Instantiate tokenizer. 
        if special_tokens_file and 'special_tokens' not in kwargs:
            special_tokens = open(
                special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(resolved_vocab_file,
                        resolved_merges_file,
                        special_tokens=special_tokens,
                        *inputs,
                        **kwargs)
        return tokenizer

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors='replace',
                 special_tokens=None,
                 max_len=None,
                 **kwargs):
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        if self.padding_side not in ["right", "left"]:
            raise ValueError(
                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
            )
        self.truncation_side = kwargs.pop("truncation_side",
                                          self.truncation_side)
        if self.truncation_side not in ["right", "left"]:
            raise ValueError(
                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
            )
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
        self.eod_id = self.encoder['<|endoftext|>']
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __call__(self,
                 text,
                 text_pair=None,
                 add_special_tokens=True,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 pad_to_multiple_of=None,
                 return_token_type_ids=None,
                 return_attention_mask=None,
                 return_overflowing_tokens=False,
                 return_length=False):
        assert padding in [True, False, "longest", "max_length", "do_not_pad"]
        if max_length is not None and padding is False and truncation is False:
            truncation = "longest_first"
        if padding is True:
            padding = "longest"
        elif padding is False:
            padding = "do_not_pad"
        assert truncation in [
            True, False, "only_first", "only_second", "longest_first",
            "do_not_truncate"
        ]
        if truncation is True:
            truncation = "longest_first"
        elif truncation is False:
            truncation = "do_not_truncate"
        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (truncation != "do_not_truncate" and padding != "do_not_pad" and
                pad_to_multiple_of is not None and max_length is not None and
                (max_length % pad_to_multiple_of != 0)):
            raise ValueError(
                "Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
) is_batched = isinstance(text, (list, tuple)) if is_batched: raise NotImplementedError else: return self.encode_plus( text=text, text_pair=text_pair, add_special_tokens=add_special_tokens, padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_length=return_length) def encode_plus(self, text, text_pair, add_special_tokens=True, padding="do_not_pad", truncation="do_not_truncate", max_length=None, pad_to_multiple_of=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_length=False, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise NotImplementedError first_ids = get_input_ids(text) second_ids = get_input_ids( text_pair) if text_pair is not None else None pair = bool(second_ids is not None) len_ids = len(first_ids) len_pair_ids = len(second_ids) if pair else 0 if return_token_type_ids and not add_special_tokens: raise ValueError( "Asking to return token_type_ids while setting add_special_tokens to False " "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None.") # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( pair=pair) if add_special_tokens else 0) # Truncation: Handle max sequence length overflowing_tokens = [] if truncation != "do_not_truncate" and max_length and total_len > max_length: first_ids, second_ids, overflowing_tokens = self.truncate_sequences( first_ids, pair_ids=second_ids, num_tokens_to_remove=total_len - max_length, truncation=truncation, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(first_ids, second_ids) token_type_ids = self.create_token_type_ids_from_sequences( first_ids, second_ids) else: sequence = first_ids + second_ids if pair else first_ids token_type_ids = [0] * len(first_ids) + ([0] * len(second_ids) if pair else []) # Build output dictionary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids # Padding if padding != "do_not_pad" or return_attention_mask: encoded_inputs = self.pad( encoded_inputs, max_length=max_length, padding=padding, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) if return_length: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) return encoded_inputs def num_special_tokens_to_add(self, pair: bool=False) -> int: token_ids_0 = [] 
token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return len(token_ids_0) * [0] return [0] * len(token_ids_0) + [1] * len(token_ids_1) def truncate_sequences( self, ids, pair_ids=None, num_tokens_to_remove=0, truncation="longest_first", stride=0, ): if num_tokens_to_remove <= 0: return ids, pair_ids, [] overflowing_tokens = [] if truncation == "only_first" or (truncation == "longest_first" and pair_ids is None): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) if self.truncation_side == "left": overflowing_tokens = ids[:window_len] ids = ids[num_tokens_to_remove:] elif self.truncation_side == "right": overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: raise ValueError( f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'." ) else: error_msg = ( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. ") if truncation == "only_first": error_msg = ( error_msg + "Please select another truncation strategy than " f"{truncation}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) elif truncation == "longest_first": warnings.warn( "Be aware, overflowing tokens are not returned for the setting you have chosen," f" i.e. sequence pairs with the '{truncation}' " "truncation strategy. So the returned list will always be empty even if some " "tokens have been removed.") for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): if self.truncation_side == "right": ids = ids[:-1] elif self.truncation_side == "left": ids = ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: if self.truncation_side == "right": pair_ids = pair_ids[:-1] elif self.truncation_side == "left": pair_ids = pair_ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) elif truncation == "only_second" and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) if self.truncation_side == "right": overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif self.truncation_side == "left": overflowing_tokens = pair_ids[:window_len] pair_ids = pair_ids[num_tokens_to_remove:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: logger.error( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the second sequence has a length {len(pair_ids)}. 
" f"Please select another truncation strategy than {truncation}, " "for instance 'longest_first' or 'only_first'.") return (ids, pair_ids, overflowing_tokens) def pad( self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, ): # The model's main input name, usually `input_ids`, has be passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" ) required_input = encoded_inputs[self.model_input_names[0]] if not required_input: if return_attention_mask: encoded_inputs["attention_mask"] = [] return encoded_inputs required_input = encoded_inputs[self.model_input_names[0]] if required_input and not isinstance(required_input[0], (list, tuple)): encoded_inputs = self._pad( encoded_inputs, max_length=max_length, padding=padding, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return encoded_inputs batch_size = len(required_input) assert all( len(v) == batch_size for v in encoded_inputs.values() ), "Some items in the output dictionary have a different batch size than others." if padding == "longest": max_length = max(len(inputs) for inputs in required_input) padding = "max_length" batch_outputs = {} for i in range(batch_size): inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) outputs = self._pad( inputs, max_length=max_length, padding=padding, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) return encoded_inputs def _pad( self, encoded_inputs, max_length=None, padding="do_not_pad", pad_to_multiple_of=None, return_attention_mask=None, ) -> dict: # Load from model defaults if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names or "attention_mask" in encoded_inputs required_input = encoded_inputs[self.model_input_names[0]] if padding == "longest": max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding != "do_not_pad" and len( required_input) != max_length # Initialize attention mask if not present. 
if return_attention_mask and "attention_mask" not in encoded_inputs: encoded_inputs["attention_mask"] = [1] * len(required_input) if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs[ "attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs[ "special_tokens_mask"] + [1] * difference if "offset_mapping" in encoded_inputs: encoded_inputs["offset_mapping"] = encoded_inputs[ "offset_mapping"] + [(0, 0)] * difference if "position_ids" in encoded_inputs: encoded_inputs["position_ids"] = encoded_inputs[ "position_ids"] + [0] * difference encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [ 0 ] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = [ self.pad_token_type_id ] * difference + encoded_inputs["token_type_ids"] if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = [ 1 ] * difference + encoded_inputs["special_tokens_mask"] if "offset_mapping" in encoded_inputs: encoded_inputs["offset_mapping"] = [ (0, 0) ] * difference + encoded_inputs["offset_mapping"] if "position_ids" in encoded_inputs: encoded_inputs["position_ids"] = [ 0 ] * difference + encoded_inputs["position_ids"] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError("Invalid padding strategy:" + str( self.padding_side)) return encoded_inputs def __len__(self): return len(self.encoder) + len(self.special_tokens) def set_special_tokens(self, special_tokens): """ Add a list of additional tokens to the encoder. The additional tokens are indexed starting from the last index of the current vocabulary in the order of the `special_tokens` list. """ if not special_tokens: self.special_tokens = {} self.special_tokens_decoder = {} return self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) self.special_tokens_decoder = { v: k for k, v in self.special_tokens.items() } logger.info("Special tokens {}".format(self.special_tokens)) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) pairs = get_pairs(word) if not pairs: return token while True: bigram = min( pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) new_word.extend(word[i:j]) i = j except BaseException: new_word.extend(word[i:]) break if word[i] == first and i < len(word) - 1 and word[ i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = ' '.join(word) self.cache[token] = word return word def tokenize(self, text): """ Tokenize a string. 
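Example (illustrative; assumes the `gpt2-medium-en` vocab and merges files can be loaded):

.. code-block::

    from paddlenlp.transformers import GPTTokenizer

    tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en')
    tokens = tokenizer.tokenize('Welcome to use PaddlePaddle and PaddleNLP')
    # tokens is a list of BPE pieces such as ['Welcome', 'Ġto', 'Ġuse', ...],
    # where 'Ġ' marks a leading space; the exact split depends on the merges file.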
""" bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: token = ''.join(self.byte_encoder[ord(b)] for b in token) else: token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend( bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens def convert_tokens_to_ids(self, tokens): """ Converts a sequence of tokens into ids using the vocab. """ ids = [] if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): if tokens in self.special_tokens: return self.special_tokens[tokens] else: return self.encoder.get(tokens, 0) for token in tokens: if token in self.special_tokens: ids.append(self.special_tokens[token]) else: ids.append(self.encoder.get(token, 0)) if len(ids) > self.max_len: warnings.warn( "Token indices sequence length is longer than the specified maximum " " sequence length for this OpenAI GPT model ({} > {}). Running this" " sequence through the model will result in indexing errors". format(len(ids), self.max_len)) return ids def convert_ids_to_string(self, ids): """ Converts a single index or a sequence of indices to texts. Args: ids (int|List[int]): The token id (or token ids) to be converted to text. Returns: str: The decoded text. Example: .. code-block:: from paddlenlp.transformers import GPTTokenizer tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930])) # 'Welcome to use PaddlePaddle and PaddleNLP' """ text = ''.join([self.decoder[id] for id in ids]) text = bytearray([self.byte_decoder[c] for c in text]).decode( 'utf-8', errors=self.errors) return text def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """Converts a sequence of ids in BPE tokens using the vocab.""" tokens = [] for i in ids: if i in self.special_tokens_decoder: if not skip_special_tokens: tokens.append(self.special_tokens_decoder[i]) else: tokens.append(self.decoder[i]) return tokens def encode(self, text): return self.convert_tokens_to_ids(self.tokenize(text)) def decode(self, tokens): text = ''.join([ self.decoder[token] if token in self.decoder.keys() else '' for token in tokens ]) text = bytearray([self.byte_decoder[c] for c in text]).decode( 'utf-8', errors=self.errors) return text def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary and merge files to a directory.""" if not os.path.isdir(vocab_path): logger.error("Vocabulary path ({}) should be a directory".format( vocab_path)) return vocab_file = os.path.join(vocab_path, VOCAB_NAME) merge_file = os.path.join(vocab_path, MERGES_NAME) special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) with open(vocab_file, 'w', encoding='utf-8') as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write(u'#version: 0.2\n') for bpe_tokens, token_index in sorted( self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: warnings.warn( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!". 
format(merge_file)) index = token_index writer.write(' '.join(bpe_tokens) + u'\n') index += 1 index = len(self.encoder) with open(special_tokens_file, 'w', encoding='utf-8') as writer: for token, token_index in sorted( self.special_tokens.items(), key=lambda kv: kv[1]): if index != token_index: warnings.warn( "Saving special tokens vocabulary to {}: BPE indices are not consecutive." " Please check that the tokenizer is not corrupted!". format(special_tokens_file)) index = token_index writer.write(token + u'\n') index += 1 return vocab_file, merge_file, special_tokens_file @property def vocab_size(self): return len(self.encoder) @property def vocab(self): return self.encoder @property def inv_vocab(self): return self.decoder @property def eos_token_id(self): return self.eod_id ================================================ FILE: ppfleetx/data/tokenizers/t5_tokenization_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py """ import bisect import itertools import re import unicodedata from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union, overload from .tokenization_utils_base import ( ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, INIT_TOKENIZER_DOCSTRING, AddedToken, BatchEncoding, EncodedInput, EncodedInputPair, PreTokenizedInput, PreTokenizedInputPair, PreTrainedTokenizerBase, TextInput, TextInputPair, TruncationStrategy, ) from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging logger = logging.get_logger(__name__) # Slow tokenizers are saved in a vocabulary plus three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" class Trie: """ Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass Loose reference https://en.wikipedia.org/wiki/Trie """ def __init__(self): self.data = {} def add(self, word): """ Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. The special key `""` is used to represent termination. This function is idempotent, adding twice the same word will leave the trie unchanged Example: ```python >>> trie = Trie() >>> trie.add("Hello 友達") >>> trie.data {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} >>> trie.add("Hello") >>> trie.data {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} ``` """ if not word: # Prevent empty string return ref = self.data for char in word: ref[char] = char in ref and ref[char] or {} ref = ref[char] ref[""] = 1 def split(self, text): """ Will look for the words added to the trie within `text`. 
Output is the original string splitted along the boundaries of the words found. This trie will match the longest possible word first ! Example: ```python >>> trie = Trie() >>> trie.split("[CLS] This is a extra_id_100") ["[CLS] This is a extra_id_100"] >>> trie.add("[CLS]") >>> trie.add("extra_id_1") >>> trie.add("extra_id_100") >>> trie.split("[CLS] This is a extra_id_100") ["[CLS]", " This is a ", "extra_id_100"] ``` """ # indexes are counted left of the chars index. # "hello", index 0, is left of h, index 1 is between h and e. # index 5 is right of the "o". # States are going to capture every possible start (indexes as above) # as keys, and have as values, a pointer to the position in the trie # where we're at. This is a partial match for now. # This enables to keep track of multiple matches while we're iterating # the string # If the trie contains, "blowing", and "lower" and we encounter the # string "blower", we need to split into ["b", "lower"]. # This is where we need to keep track of multiple possible starts. states = OrderedDict() # This will contain every indices where we need # to cut. # We force to cut at offset 0 and len(text) (added later) offsets = [0] # This is used by the lookahead which needs to skip over # some text where the full match exceeded the place in the initial # for loop skip = 0 # Main loop, Giving this algorithm O(n) complexity for current, current_char in enumerate(text): if skip and current < skip: # Prevents the lookahead for matching twice # like extra_id_100 and id_100 continue # This will track every state # that stop matching, we need to stop tracking them. # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then # fail on "b", we need to remove 0 from the valid states. to_remove = set() # Whenever we found a match, we need to drop everything # this is a greedy algorithm, it will match on the first found token reset = False # In this case, we already have partial matches (But unfinished) for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. # Lookahead to match longest first # Important in case of extra_id_1 vs extra_id_100 # Here we are also actively looking for other earlier partial # matches # "[CLS]", "L", we need to match CLS even if L is special for lookstart, looktrie_pointer in states.items(): if lookstart > start: # This partial match is later, we can stop looking break elif lookstart < start: # This partial match is earlier, the trie pointer # was already updated, so index is + 1 lookahead_index = current + 1 end = current + 1 else: # Here lookstart == start and # looktrie_pointer == trie_pointer # It wasn't updated yet so indices are current ones lookahead_index = current end = current next_char = text[ lookahead_index] if lookahead_index < len( text) else None if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index while next_char in looktrie_pointer: looktrie_pointer = looktrie_pointer[next_char] lookahead_index += 1 if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index if lookahead_index == len(text): # End of string break next_char = text[lookahead_index] # End lookahead # Storing and resetting offsets.append(start) offsets.append(end) reset = True break elif current_char in trie_pointer: # The current character being looked at has a match within the trie # update the pointer (it will be stored back into states later). 
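# (For example, while scanning "extra_id_100" with "extra_id_1" and
#  "extra_id_100" both in the trie, the pointer for that partial match simply
#  walks one level deeper per character until either the terminator "" is
#  reached or the next character is missing from the current node.)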
trie_pointer = trie_pointer[current_char] # Storing back the new pointer into the states. # Partial matches got longer by one. states[start] = trie_pointer else: # The new character has not match in the trie, we need # to stop keeping track of this partial match. # We can't do it directly within the loop because of how # python iteration works to_remove.add(start) # Either clearing the full start (we found a real match) # Or clearing only the partial matches that didn't work. if reset: states = {} else: for start in to_remove: del states[start] # If this character is a starting character within the trie # start keeping track of this partial match. if current >= skip and current_char in self.data: states[current] = self.data[current_char] # We have a cut at the end with states. for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. end = len(text) offsets.append(start) offsets.append(end) # Longest cut is always the one with lower start so the first # item so we need to break. break return self.cut_text(text, offsets) def cut_text(self, text, offsets): # We have all the offsets now, we just need to do the actual splitting. # We need to eventually add the first part of the string and the eventual # last part. offsets.append(len(text)) tokens = [] start = 0 for end in offsets: if start > end: logger.error( "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" " anyway.") continue elif start == end: # This might happen if there's a match at index 0 # we're also preventing zero-width cuts in case of two # consecutive matches continue tokens.append(text[start:end]) start = end return tokens def _is_whitespace(char): """Checks whether `char` is a whitespace character.""" # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `char` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `char` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False def _is_end_of_word(text): """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" last_char = text[-1] return bool( _is_control(last_char) | _is_punctuation(last_char) | _is_whitespace( last_char)) def _is_start_of_word(text): """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" first_char = text[0] return bool( _is_control(first_char) | _is_punctuation(first_char) | _is_whitespace( first_char)) def _insert_one_token_to_ordered_list(token_list, new_token): """ Inserts one token to an ordered list if it does not already exist. 
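A minimal illustration (hypothetical token values):

```python
token_list = ["[CLS]", "[UNK]"]
_insert_one_token_to_ordered_list(token_list, "[SEP]")
# token_list is now ["[CLS]", "[SEP]", "[UNK]"]
_insert_one_token_to_ordered_list(token_list, "[CLS]")
# "[CLS]" is already present, so token_list is unchanged
```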
Note: token_list must be sorted. """ insertion_idx = bisect.bisect_left(token_list, new_token) # Checks if new_token is already in the ordered token_list if insertion_idx < len(token_list) and token_list[ insertion_idx] == new_token: # new_token is in token_list, don't add return else: token_list.insert(insertion_idx, new_token) @add_end_docstrings(INIT_TOKENIZER_DOCSTRING) class PreTrainedTokenizer(PreTrainedTokenizerBase): """ Base class for all slow tokenizers. Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`]. Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). """ def __init__(self, **kwargs): super().__init__(**kwargs) # Added tokens - We store this for both slow and fast tokenizers # until the serialization of Fast tokenizers is updated self.added_tokens_encoder = {} self.added_tokens_decoder = {} self.unique_no_split_tokens = [] self.tokens_trie = Trie() self._decode_use_source_tokenizer = False @property def is_fast(self): return False @property def vocab_size(self): """ `int`: Size of the base vocabulary (without the added tokens). """ raise NotImplementedError def get_added_vocab(self): """ Returns the added tokens in the vocabulary as a dictionary of token to index. Returns: `Dict[str, int]`: The added tokens. """ return self.added_tokens_encoder def __len__(self): """ Size of the full vocabulary with the added tokens. """ return self.vocab_size + len(self.added_tokens_encoder) def _add_tokens(self, new_tokens, special_tokens=False): """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Args: new_tokens (`List[str]`or `List[tokenizers.AddedToken]`): Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by checking if the tokenizer assign the index of the `unk_token` to them). special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the tokens should be added as special tokens. Returns: `int`: The number of tokens actually added to the vocabulary. Examples: ```python # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") model = BertModel.from_pretrained("bert-base-uncased") num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) print("We have added", num_added_toks, "tokens") # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
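# e.g. for "bert-base-uncased" the base vocab has 30522 entries, so after adding
# the two tokens above len(tokenizer) is 30524 (assuming neither was already present).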
model.resize_token_embeddings(len(tokenizer)) ```""" new_tokens = [str(tok) for tok in new_tokens] tokens_to_add = [] for token in new_tokens: if not isinstance(token, str): raise TypeError( "Token {token} is not a string but a {type(token)}.") if not special_tokens and hasattr( self, "do_lower_case") and self.do_lower_case: token = token.lower() if (token != self.unk_token and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and token not in tokens_to_add): tokens_to_add.append(token) #if self.verbose: # logger.info(f"Adding {token} to the vocabulary") added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.added_tokens_decoder.update(added_tok_decoder) # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) if special_tokens: if len(new_tokens) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union(set(new_tokens))) else: # Or on the newly added tokens if len(tokens_to_add) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union( set(tokens_to_add))) self._create_trie(self.unique_no_split_tokens) return len(tokens_to_add) def _create_trie(self, unique_no_split_tokens): trie = Trie() for token in unique_no_split_tokens: if hasattr( self, "do_lower_case" ) and self.do_lower_case and token not in self.all_special_tokens: trie.add(token.lower()) else: trie.add(token) self.tokens_trie = trie def num_special_tokens_to_add(self, pair): """ Returns the number of added tokens when encoding a sequence with special tokens. This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair (`bool`, *optional*, defaults to `False`): Whether the number of added tokens should be computed in the case of a sequence pair or a single sequence. Returns: `int`: Number of special tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens, using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. Args: text (`str`): The sequence to be encoded. **kwargs (additional keyword arguments): Passed along to the model-specific `prepare_for_tokenization` preprocessing method. Returns: `List[str]`: The list of tokens. """ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors all_special_tokens_extended = dict( (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)) text, kwargs = self.prepare_for_tokenization(text, **kwargs) if kwargs: logger.warning("Keyword arguments {kwargs} not recognized.") # TODO: should this be in the base class? 
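# (Illustration of the lower-casing step below: with do_lower_case=True and
#  "[CLS]" registered as a special token, the pattern lower-cases everything
#  except the escaped special tokens, so "Hello [CLS] World" becomes
#  "hello [CLS] world" before the trie split.)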
if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: tok_extended = all_special_tokens_extended.get(token, None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): if tok_extended.rstrip and right: # A bit counter-intuitive but we strip the left of the string # since tok_extended.rstrip means the special token is eating all white spaces on its right tokens[i + 1] = right.lstrip() # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here else: # We strip left and right by default if right: tokens[i + 1] = right.lstrip() if left: tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: # Need to skip eventual empty (fully stripped) tokens if not token: continue if token in no_split_token: tokenized_text.append(token) else: tokenized_text.extend(self._tokenize(token)) # ["This", " is", " something", "", "else"] return tokenized_text def _tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Do NOT take care of added tokens. """ raise NotImplementedError def convert_tokens_to_ids(self, tokens): """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. Args: tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. 
""" if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): raise NotImplementedError def _encode_plus(self, text, text_pair=None, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: if is_split_into_words: raise ValueError( "Input {text} is not valid. Should be a string or a list/tuple of strings when" " `is_split_into_words=True`.") else: raise ValueError( "Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of" " integers.") if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast. 
" "More information on available tokenizers at " "https://github.com/huggingface/transformers/pull/2674") first_ids = get_input_ids(text) second_ids = get_input_ids( text_pair) if text_pair is not None else None return self.prepare_for_model( first_ids, pair_ids=second_ids, add_special_tokens=add_special_tokens, padding=padding_strategy.value, truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, verbose=verbose, ) def _batch_encode_plus( self, batch_text_or_text_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. 
" "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast.") input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if not isinstance(ids_or_pair_ids, (list, tuple)): ids, pair_ids = ids_or_pair_ids, None elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): ids, pair_ids = ids_or_pair_ids, None else: ids, pair_ids = ids_or_pair_ids first_ids = get_input_ids(ids) second_ids = get_input_ids( pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) batch_outputs = self._batch_prepare_for_model( input_ids, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=return_tensors, verbose=verbose, ) return BatchEncoding(batch_outputs) @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) def _batch_prepare_for_model( self, batch_ids_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_length=False, verbose=True, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: batch_ids_pairs: list of tokenized input ids or input ids pairs """ batch_outputs = {} for first_ids, second_ids in batch_ids_pairs: outputs = self.prepare_for_model( first_ids, second_ids, add_special_tokens=add_special_tokens, padding=PaddingStrategy.DO_NOT_PAD. value, # we pad in batch afterward truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=None, # We convert the whole batch to tensors at the end prepend_batch_axis=False, verbose=verbose, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) batch_outputs = self.pad( batch_outputs, padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) batch_outputs = BatchEncoding( batch_outputs, tensor_type=return_tensors) return batch_outputs def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): """ Performs any necessary transformations before tokenization. This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the `kwargs` at the end of the encoding process to be sure all the arguments have been used. Args: text (`str`): The text to prepare. 
is_split_into_words (`bool`, *optional*, defaults to `False`): Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification. kwargs: Keyword arguments to use for the tokenization. Returns: `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. """ return (text, kwargs) def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. Args: token_ids_0 (`List[int]`): List of ids of the first sequence. token_ids_1 (`List[int]`, *optional*): List of ids of the second sequence. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." ) return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) @overload def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool=False) -> str: ... @overload def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool=False) -> List[str]: ... def convert_ids_to_tokens( self, ids: Union[int, List[int]], skip_special_tokens: bool=False) -> Union[str, List[str]]: """ Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens. Args: ids (`int` or `List[int]`): The token id (or token ids) to convert to tokens. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. Returns: `str` or `List[str]`: The decoded token(s). """ if isinstance(ids, int): if ids in self.added_tokens_decoder: return self.added_tokens_decoder[ids] else: return self._convert_id_to_token(ids) tokens = [] for index in ids: index = int(index) if skip_special_tokens and index in self.all_special_ids: continue if index in self.added_tokens_decoder: tokens.append(self.added_tokens_decoder[index]) else: tokens.append(self._convert_id_to_token(index)) return tokens def _convert_id_to_token(self, index: int) -> str: raise NotImplementedError def convert_tokens_to_string(self, tokens: List[str]) -> str: return " ".join(tokens) def _decode(self, token_ids: List[int], skip_special_tokens: bool=False, clean_up_tokenization_spaces: bool=True, spaces_between_special_tokens: bool=True, **kwargs) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) filtered_tokens = self.convert_ids_to_tokens( token_ids, skip_special_tokens=skip_special_tokens) # To avoid mixing byte-level and unicode for byte-level BPT # we need to build string separately for added tokens and byte-level tokens # cf. 
https://github.com/huggingface/transformers/issues/1133 sub_texts = [] current_sub_text = [] for token in filtered_tokens: if skip_special_tokens and token in self.all_special_ids: continue if token in self.added_tokens_encoder: if current_sub_text: sub_texts.append( self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) if spaces_between_special_tokens: text = " ".join(sub_texts) else: text = "".join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text else: return text ================================================ FILE: ppfleetx/data/tokenizers/t5_tokenizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Google T5.""" from __future__ import (absolute_import, division, print_function, unicode_literals) import copy import sys import json import logging import warnings import os import regex as re from io import open from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union, overload import sentencepiece as spm from ppfleetx.utils.download import cached_path from ppfleetx.data.tokenizers.tokenization_utils_base import ( _LazyConfigMapping, AddedToken, TruncationStrategy, PaddingStrategy, BatchEncoding, SpecialTokensMixin) try: from functools import lru_cache except ImportError: # Just a dummy decorator to get the checks to run on python2 # because honestly I don't want to support a byte-level unicode BPE # tokenizer on python 2 right now. 
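# The fallback below is a no-op decorator factory: lru_cache() returns a
# decorator that hands the function back unchanged, so nothing is cached.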
def lru_cache(): return lambda func: func from ppfleetx.utils.log import logger MAX_LENGTH = 256 VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} MODEL_FILES_NAMES = {"config_file": "config.json"} CONFIG_MAPPING_NAMES = OrderedDict([("t5", "T5Config")]) CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { 't5-11b': "https://fleet.bj.bcebos.com/datasets/t5/spiece.model", } } PRETRAINED_MERGES_ARCHIVE_MAP = { 't5-11b': "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt", } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, "t5-base": 512, "t5-large": 512, "t5-3b": 512, "t5-11b": 512, } # Slow tokenizers used to be saved in three separate files DEFAULT_T5_NAME = "projects/imagen/t5/t5-11b" SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" # Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file FULL_TOKENIZER_FILE = "tokenizer.json" _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json") def get_t5_tokenizer(name=DEFAULT_T5_NAME): tokenizer = T5Tokenizer.from_pretrained(name) return tokenizer def t5_tokenize(texts, tokenizer): encoded = tokenizer.batch_encode_plus( texts, return_tensors="paddle", padding='longest', max_length=MAX_LENGTH, truncation=True) input_ids = encoded.input_ids attn_mask = encoded.attention_mask return input_ids, attn_mask class T5Tokenizer(SpecialTokensMixin): """ T5 tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES config_files_names = MODEL_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = None padding_side = "right" truncation_side = "right" def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>", pad_token="<pad>", extra_ids=100, additional_special_tokens=None, sp_model_kwargs=None, **kwargs): # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: additional_special_tokens = [ f"<extra_id_{i}>" for i in range(extra_ids) ] elif extra_ids > 0 and additional_special_tokens is not None: # Check that we have the right number of extra_id special tokens extra_tokens = len( set( filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) if extra_tokens != extra_ids: raise ValueError( f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" " provided to T5Tokenizer. 
In this case the additional_special_tokens must include the extra_ids" " tokens") self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, extra_ids=extra_ids, additional_special_tokens=additional_special_tokens, sp_model_kwargs=self.sp_model_kwargs, **kwargs) self.vocab_file = vocab_file self._extra_ids = extra_ids self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) self.deprecation_warnings = ({}) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} if os.path.isfile(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " "supported for this tokenizer. Use a model identifier or the path to a directory instead." ) warnings.warn( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.", FutureWarning, ) file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } vocab_files_target = { ** cls.vocab_files_names, ** cls.config_files_names, ** additional_files_names } if "tokenizer_file" in vocab_files_target: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE resolved_config_file = get_file_from_repo( pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, proxies=proxies, use_auth_token=use_auth_token, revision=revision, local_files_only=local_files_only, ) if resolved_config_file is not None: with open( resolved_config_file, encoding="utf-8") as reader: tokenizer_config = json.load(reader) if "fast_tokenizer_files" in tokenizer_config: fast_tokenizer_file = get_fast_tokenizer_file( tokenizer_config["fast_tokenizer_files"]) vocab_files_target["tokenizer_file"] = fast_tokenizer_file # Look for the tokenizer files for file_id, file_name in vocab_files_target.items(): if os.path.isdir(pretrained_model_name_or_path): if subfolder is not None: full_file_name = os.path.join( pretrained_model_name_or_path, subfolder, file_name) else: full_file_name = os.path.join( pretrained_model_name_or_path, file_name) if not os.path.exists(full_file_name): #logger.info("Didn't find file {full_file_name}. 
We won't load it.") full_file_name = None vocab_files[file_id] = full_file_name # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} unresolved_files = [] for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None else: try: resolved_vocab_files[file_id] = cached_path( file_path, cache_dir=cache_dir, ) except EnvironmentError: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ', '.join( PRETRAINED_VOCAB_ARCHIVE_MAP.keys( )), pretrained_model_name_or_path, vocab_file, merges_file)) return None if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): raise EnvironmentError( f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing all relevant files for a {cls.__name__} tokenizer." ) for file_id, file_path in vocab_files.items(): if file_id not in resolved_vocab_files: continue return cls._from_pretrained( resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=use_auth_token, cache_dir=cache_dir, **kwargs, ) @classmethod def _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=None, cache_dir=None, **kwargs): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json # file or if `from_slow` is set to True. from_slow = kwargs.get("from_slow", False) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None if (from_slow or not has_tokenizer_file ) and cls.slow_tokenizer_class is not None: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( copy.deepcopy(resolved_vocab_files), pretrained_model_name_or_path, copy.deepcopy(init_configuration), *init_inputs, **(copy.deepcopy(kwargs)), ) else: slow_tokenizer = None # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop( "tokenizer_config_file", None) if tokenizer_config_file is not None: with open( tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. config_tokenizer_class = init_kwargs.get("tokenizer_class") init_kwargs.pop("tokenizer_class", None) init_kwargs.pop("auto_map", None) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: config_tokenizer_class = None init_kwargs = init_configuration if config_tokenizer_class is None: # Second attempt. If we have not yet found tokenizer_class, let's try to use the config. try: config_dict = resolved_vocab_files.pop("config_file", None) config_dict = cls._dict_from_json_file(config_dict) config_tokenizer_class = config_dict[ "tokenizer_class"] if "tokenizer_class" in config_dict else None except (OSError, ValueError, KeyError): # skip if an error occurred. config_dict = None if config_tokenizer_class is None: # Third attempt. 
If we have not yet found the original type of the tokenizer, # we are loading we see if we can infer it from the type of the configuration file from ppfleetx.data.tokenizers.tokenization_utils_base import TOKENIZER_MAPPING_NAMES # tests_ignore model_type = config_dict[ "model_type"] if "model_type" in config_dict else None if model_type is None: # Fallback: use pattern matching on the string. model_type = None for pattern in TOKENIZER_MAPPING_NAMES.keys(): if pattern in str(pretrained_model_name_or_path): model_type = pattern break if model_type is not None: config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( model_type, (None, None)) if config_tokenizer_class is None: config_tokenizer_class = config_tokenizer_class_fast if config_tokenizer_class is not None: if cls.__name__.replace( "Fast", "") != config_tokenizer_class.replace("Fast", ""): logger.warning( "The tokenizer class you load from this checkpoint is not the same type as the class this" " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you" f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called" f" from is '{cls.__name__}'.") # Update with newly provided kwargs init_kwargs.update(kwargs) # Convert AddedTokens serialized as dict to class instances def convert_added_tokens(obj): if isinstance(obj, dict) and "__type" in obj and obj[ "__type"] == "AddedToken": obj.pop("__type") return AddedToken(**obj) elif isinstance(obj, (list, tuple)): return list(convert_added_tokens(o) for o in obj) elif isinstance(obj, dict): return {k: convert_added_tokens(v) for k, v in obj.items()} return obj init_kwargs = convert_added_tokens(init_kwargs) # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings model_max_length = cls.max_model_input_sizes[ pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): model_max_length = min( init_kwargs.get("model_max_length", int(1e30)), model_max_length) # TODO(PVP) - uncomment following line in Transformers v5 # init_kwargs["model_max_length"] = model_max_length # TODO(PVP) - remove in Transformers v5 # --- init_kwargs[ "model_max_length"] = cls._eventually_correct_t5_max_length( pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")) # --- # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer init_kwargs["name_or_path"] = pretrained_model_name_or_path # Instantiate tokenizer. try: tokenizer = cls(**init_kwargs) except OSError: raise OSError( "Unable to load vocabulary from file. " "Please check that the provided vocabulary is accessible and not corrupted." ) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` # Removed: Now done at the base class level # tokenizer.init_inputs = init_inputs # tokenizer.init_kwargs = init_kwargs # If there is a complementary special token map, load it special_tokens_map_file = resolved_vocab_files.pop( "special_tokens_map_file", None) # Add supplementary tokens. 
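# (Roughly, for the default T5 configuration this covers eos/unk/pad plus the
#  100 "<extra_id_*>" sentinels passed through additional_special_tokens;
#  sanitize_special_tokens() below re-registers any of them that the loaded
#  tokenizer does not yet know about or treat as "no split" tokens.)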
special_tokens = tokenizer.all_special_tokens # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab added_tokens = tokenizer.sanitize_special_tokens() if added_tokens: logger.warning_advice( "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" " fine-tuned or trained.") return tokenizer def _eventual_warn_about_too_long_sequence(self, ids, max_length, verbose: bool): """ Depending on the input and internal state we might trigger a warning about a sequence that is too long for its corresponding model Args: ids (`List[str]`): The ids produced by the tokenization max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set) verbose (`bool`): Whether or not to print more information and warnings. """ if max_length is None and len(ids) > self.model_max_length and verbose: if not self.deprecation_warnings.get( "sequence-length-is-longer-than-the-specified-maximum", False): logger.warning( "Token indices sequence length is longer than the specified maximum sequence length " f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " "will result in indexing errors") self.deprecation_warnings[ "sequence-length-is-longer-than-the-specified-maximum"] = True def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. 
" "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." ) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. " "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." 
) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def _pad(self, encoded_inputs, max_length=None, padding_strategy=PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of=None, return_attention_mask=None): """ Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Args: encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). max_length: maximum length of the returned list and optionally padding length (see below). Will truncate by taking into account the special tokens. padding_strategy: PaddingStrategy to use for padding. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad The tokenizer padding sides are defined in self.padding_side: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names required_input = encoded_inputs[self.model_input_names[0]] if padding_strategy == PaddingStrategy.LONGEST: max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( required_input) != max_length # Initialize attention mask if not present. 
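        # Worked example (hypothetical values, assuming padding_side="right",
        # pad_token_id=0, max_length=6 and required_input=[101, 2023, 102]):
        # the padding branch below would yield
        #     input_ids      -> [101, 2023, 102, 0, 0, 0]
        #     attention_mask -> [1, 1, 1, 0, 0, 0]
        # i.e. real tokens keep mask value 1 and the appended pad positions get 0.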
if return_attention_mask and "attention_mask" not in encoded_inputs: encoded_inputs["attention_mask"] = [1] * len(required_input) if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs[ "attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs[ "special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [ 0 ] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = [ self.pad_token_type_id ] * difference + encoded_inputs["token_type_ids"] if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = [ 1 ] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError("Invalid padding strategy:" + str( self.padding_side)) return encoded_inputs def pad( self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, ): """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`) If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of PyTorch tensors, you will lose the specific device of your tensors however. Args: encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask) return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return Numpy `np.ndarray` objects. verbose (`bool`, *optional*, defaults to `True`): Whether or not to print more information and warnings. """ # If we have a list of dicts, let's convert it in a dict of lists # We do this to allow using this method as a collate_fn function in PyTorch Dataloader if isinstance(encoded_inputs, (list, tuple)) and isinstance( encoded_inputs[0], Mapping): encoded_inputs = { key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys() } # The model's main input name, usually `input_ids`, has be passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" ) required_input = encoded_inputs[self.model_input_names[0]] if not required_input: if return_attention_mask: encoded_inputs["attention_mask"] = [] return encoded_inputs # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects # and rebuild them afterwards if no return_tensors is specified # Note that we lose the specific device the tensor may be on for PyTorch first_element = required_input[0] if isinstance(first_element, (list, tuple)): # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. for item in required_input: if len(item) != 0: first_element = item[0] break # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. if not isinstance(first_element, (int, list, tuple)): if is_tf_available() and _is_tensorflow(first_element): return_tensors = "tf" if return_tensors is None else return_tensors elif is_torch_available() and _is_torch(first_element): return_tensors = "pt" if return_tensors is None else return_tensors elif isinstance(first_element, np.ndarray): return_tensors = "np" if return_tensors is None else return_tensors else: raise ValueError( f"type of {first_element} unknown: {type(first_element)}. " "Should be one of a python, numpy, pytorch or tensorflow object." 
) for key, value in encoded_inputs.items(): encoded_inputs[key] = to_py_obj(value) # Convert padding_strategy in PaddingStrategy padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( padding=padding, max_length=max_length, verbose=verbose) required_input = encoded_inputs[self.model_input_names[0]] if required_input and not isinstance(required_input[0], (list, tuple)): encoded_inputs = self._pad( encoded_inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) batch_size = len(required_input) assert all( len(v) == batch_size for v in encoded_inputs.values() ), "Some items in the output dictionary have a different batch size than others." if padding_strategy == PaddingStrategy.LONGEST: max_length = max(len(inputs) for inputs in required_input) padding_strategy = PaddingStrategy.MAX_LENGTH batch_outputs = {} for i in range(batch_size): inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) outputs = self._pad( inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) return BatchEncoding(batch_outputs, tensor_type=return_tensors) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of zeros. """ eos = [self.eos_token_id] if token_ids_1 is None: return len(token_ids_0 + eos) * [0] return len(token_ids_0 + eos + token_ids_1 + eos) * [0] def _add_eos_if_not_present(self, token_ids): """Do not add eos again if user already added it.""" if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: warnings.warn( f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" " eos tokens being added.") return token_ids else: return token_ids + [self.eos_token_id] def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format: - single sequence: `X ` - pair of sequences: `A B ` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ token_ids_0 = self._add_eos_if_not_present(token_ids_0) if token_ids_1 is None: return token_ids_0 else: token_ids_1 = self._add_eos_if_not_present(token_ids_1) return token_ids_0 + token_ids_1 def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0): """ Truncates a sequence pair in-place following the strategy. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. 
pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (`int`, *optional*, defaults to 0): Number of tokens to remove using the truncation strategy. truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): The strategy to follow for truncation. Can be: - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided. - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater than the model maximum admissible input size). stride (`int`, *optional*, defaults to 0): If set to a positive number, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. Returns: `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair of sequences (or a batch of pairs) is provided. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if not isinstance(truncation_strategy, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation_strategy) overflowing_tokens = [] if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) if self.truncation_side == "left": overflowing_tokens = ids[:window_len] ids = ids[num_tokens_to_remove:] elif self.truncation_side == "right": overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: raise ValueError( f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'." ) else: error_msg = ( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. ") if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: logger.warning( "Be aware, overflowing tokens are not returned for the setting you have chosen," f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " "truncation strategy. 
So the returned list will always be empty even if some " "tokens have been removed.") for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): if self.truncation_side == "right": ids = ids[:-1] elif self.truncation_side == "left": ids = ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: if self.truncation_side == "right": pair_ids = pair_ids[:-1] elif self.truncation_side == "left": pair_ids = pair_ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) if self.truncation_side == "right": overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif self.truncation_side == "left": overflowing_tokens = pair_ids[:window_len] pair_ids = pair_ids[num_tokens_to_remove:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: logger.error( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the second sequence has a length {len(pair_ids)}. " f"Please select another truncation strategy than {truncation_strategy}, " "for instance 'longest_first' or 'only_first'.") return (ids, pair_ids, overflowing_tokens) def prepare_for_model(self, ids, pair_ids=None, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, prepend_batch_axis=False, **kwargs): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an error. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 if return_token_type_ids and not add_special_tokens: raise ValueError( "Asking to return token_type_ids while setting add_special_tokens to False " "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None.") if (return_overflowing_tokens and truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is not None): raise ValueError( "Not possible to return overflowing tokens for pair of sequences with the " "`longest_first`. 
Please select another truncation strategy than `longest_first`, " "for instance `only_second` or `only_first`.") # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( pair=pair) if add_special_tokens else 0) # Truncation: Handle max sequence length overflowing_tokens = [] if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, pair_ids=pair_ids, num_tokens_to_remove=total_len - max_length, truncation_strategy=truncation_strategy, stride=stride, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences( ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) # Build output dictionary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids if return_special_tokens_mask: if add_special_tokens: encoded_inputs[ "special_tokens_mask"] = self.get_special_tokens_mask( ids, pair_ids) else: encoded_inputs["special_tokens_mask"] = [0] * len(sequence) # Check lengths self._eventual_warn_about_too_long_sequence( encoded_inputs["input_ids"], max_length, verbose) # Padding if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: encoded_inputs = self.pad( encoded_inputs, max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) if return_length: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) batch_outputs = BatchEncoding( encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis) return batch_outputs def _batch_prepare_for_model( self, batch_ids_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_length=False, verbose=True, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: batch_ids_pairs: list of tokenized input ids or input ids pairs """ batch_outputs = {} for first_ids, second_ids in batch_ids_pairs: outputs = self.prepare_for_model( first_ids, second_ids, add_special_tokens=add_special_tokens, padding=PaddingStrategy.DO_NOT_PAD. 
value, # we pad in batch afterward truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=None, # We convert the whole batch to tensors at the end prepend_batch_axis=False, verbose=verbose, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) batch_outputs = self.pad( batch_outputs, padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) batch_outputs = BatchEncoding( batch_outputs, tensor_type=return_tensors) return batch_outputs def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." 
) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. " "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." 
) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): """ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. This method is deprecated, `__call__` should be used instead. Args: batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`): Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in `encode_plus`). """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) return self._batch_encode_plus( batch_text_or_text_pairs=batch_text_or_text_pairs, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, verbose=verbose, **kwargs, ) def _batch_encode_plus( self, batch_text_or_text_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise ValueError( "Input is not valid. 
Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast.") input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if not isinstance(ids_or_pair_ids, (list, tuple)): ids, pair_ids = ids_or_pair_ids, None elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): ids, pair_ids = ids_or_pair_ids, None else: ids, pair_ids = ids_or_pair_ids first_ids = get_input_ids(ids) second_ids = get_input_ids( pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) batch_outputs = self._batch_prepare_for_model( input_ids, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=return_tensors, verbose=verbose, ) return BatchEncoding(batch_outputs) def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens, using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. Args: text (`str`): The sequence to be encoded. **kwargs (additional keyword arguments): Passed along to the model-specific `prepare_for_tokenization` preprocessing method. Returns: `List[str]`: The list of tokens. """ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors all_special_tokens_extended = dict( (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)) text, kwargs = self.prepare_for_tokenization(text, **kwargs) if kwargs: logger.warning(f"Keyword arguments {kwargs} not recognized.") # TODO: should this be in the base class? 
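        # Rough sketch of what follows (hypothetical special token "<sep>"
        # registered as a no-split token): self.tokens_trie.split("Hello <sep> world")
        # would return ["Hello ", "<sep>", " world"]; the "<sep>" chunk is kept
        # verbatim while the surrounding chunks are passed to self._tokenize()
        # further below, with neighbouring whitespace stripped around the
        # special token by default.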
if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: tok_extended = all_special_tokens_extended.get(token, None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): if tok_extended.rstrip and right: # A bit counter-intuitive but we strip the left of the string # since tok_extended.rstrip means the special token is eating all white spaces on its right tokens[i + 1] = right.lstrip() # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here else: # We strip left and right by default if right: tokens[i + 1] = right.lstrip() if left: tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: # Need to skip eventual empty (fully stripped) tokens if not token: continue if token in no_split_token: tokenized_text.append(token) else: tokenized_text.extend(self._tokenize(token)) # ["This", " is", " something", "", "else"] return tokenized_text def _tokenize(self, text): """Take as input a string and return a list of strings (tokens) for words/sub-words""" return self.sp_model.encode(text, out_type=str) def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): """ Performs any necessary transformations before tokenization. This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the `kwargs` at the end of the encoding process to be sure all the arguments have been used. Args: text (`str`): The text to prepare. is_split_into_words (`bool`, *optional*, defaults to `False`): Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification. kwargs: Keyword arguments to use for the tokenization. Returns: `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. """ return (text, kwargs) def convert_tokens_to_ids(self, tokens): """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. Args: tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. 
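
        Example (illustrative only; the tokens and ids below are hypothetical):

        ```python
        tokenizer.convert_tokens_to_ids("▁Hello")
        # 8774
        tokenizer.convert_tokens_to_ids(["▁Hello", "▁world"])
        # [8774, 296]
        ```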
""" if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair (`bool`, *optional*, defaults to `False`): Whether the number of added tokens should be computed in the case of a sequence pair or a single sequence. Returns: `int`: Number of special tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format: - single sequence: `X ` - pair of sequences: `A B ` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
""" token_ids_0 = self._add_eos_if_not_present(token_ids_0) if token_ids_1 is None: return token_ids_0 else: token_ids_1 = self._add_eos_if_not_present(token_ids_1) return token_ids_0 + token_ids_1 @staticmethod def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length): if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes: deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[ pretrained_model_name_or_path] if init_max_model_length is not None and init_max_model_length != max_model_length: return init_max_model_length elif init_max_model_length is None: warnings.warn( "This tokenizer was incorrectly instantiated with a model max length of" f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this" " behavior is kept to avoid breaking backwards compatibility when padding/encoding with" " `truncation is True`.\n- Be aware that you SHOULD NOT rely on" f" {pretrained_model_name_or_path} automatically truncating your input to" f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences" f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with" " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please" " instantiate this tokenizer with `model_max_length` set to your preferred value.", FutureWarning, ) return max_model_length @property def vocab_size(self): return self.sp_model.get_piece_size() + self._extra_ids def get_vocab(self): vocab = { self.convert_ids_to_tokens(i): i for i in range(self.vocab_size) } vocab.update(self.added_tokens_encoder) return vocab def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) # normal case: some special tokens if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] def _add_eos_if_not_present(self, token_ids): """Do not add eos again if user already added it.""" if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: warnings.warn( f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" " eos tokens being added.") return token_ids else: return token_ids + [self.eos_token_id] def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of zeros. 
""" eos = [self.eos_token_id] if token_ids_1 is None: return len(token_ids_0 + eos) * [0] return len(token_ids_0 + eos + token_ids_1 + eos) * [0] def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format: - single sequence: `X ` - pair of sequences: `A B ` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ token_ids_0 = self._add_eos_if_not_present(token_ids_0) if token_ids_1 is None: return token_ids_0 else: token_ids_1 = self._add_eos_if_not_present(token_ids_1) return token_ids_0 + token_ids_1 def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d # for backward compatibility if not hasattr(self, "sp_model_kwargs"): self.sp_model_kwargs = {} self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def _tokenize(self, text: str): """Take as input a string and return a list of strings (tokens) for words/sub-words""" return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: token = f"" return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] out_string = "" for token in tokens: # make sure that special tokens are not decoded using sentencepiece model if token in self.all_special_tokens: out_string += self.sp_model.decode_pieces( current_sub_tokens) + token + " " current_sub_tokens = [] else: current_sub_tokens.append(token) out_string += self.sp_model.decode_pieces(current_sub_tokens) return out_string.strip() def save_vocabulary(self, save_directory, filename_prefix=None): if not os.path.isdir(save_directory): logger.error( f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath( out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file, ) @classmethod def _dict_from_json_file(cls, json_file): with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() return json.loads(text) ================================================ FILE: ppfleetx/data/tokenizers/tokenization_utils_base.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Inc. team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers) """ import copy import json import os import re import warnings from collections import OrderedDict, UserDict from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union import importlib import numpy as np def is_sentencepiece_available(): return importlib.util.find_spec("sentencepiece") is not None def is_tokenizers_available(): return importlib.util.find_spec("tokenizers") is not None if is_tokenizers_available(): from tokenizers import AddedToken else: @dataclass(frozen=True, eq=True) class AddedToken: """ AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the way it should behave. """ content: str = field(default_factory=str) single_word: bool = False lstrip: bool = False rstrip: bool = False normalized: bool = True def __getstate__(self): return self.__dict__ TOKENIZER_MAPPING_NAMES = OrderedDict([ ( "albert", ( "AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None, ), ), ("bart", ("BartTokenizer", "BartTokenizerFast")), ( "barthez", ( "BarthezTokenizer" if is_sentencepiece_available() else None, "BarthezTokenizerFast" if is_tokenizers_available() else None, ), ), ("bartpho", ("BartphoTokenizer", None)), ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), ("bert-japanese", ("BertJapaneseTokenizer", None)), ("bertweet", ("BertweetTokenizer", None)), ( "big_bird", ( "BigBirdTokenizer" if is_sentencepiece_available() else None, "BigBirdTokenizerFast" if is_tokenizers_available() else None, ), ), ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)), ("byt5", ("ByT5Tokenizer", None)), ( "camembert", ( "CamembertTokenizer" if is_sentencepiece_available() else None, "CamembertTokenizerFast" if is_tokenizers_available() else None, ), ), ("canine", ("CanineTokenizer", None)), ( "clip", ( "CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None, ), ), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), ( "cpm", ( "CpmTokenizer" if is_sentencepiece_available() else None, "CpmTokenizerFast" if is_tokenizers_available() else None, ), ), ("ctrl", 
("CTRLTokenizer", None)), ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), ( "deberta-v2", ( "DebertaV2Tokenizer" if is_sentencepiece_available() else None, "DebertaV2TokenizerFast" if is_tokenizers_available() else None, ), ), ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), ( "dpr", ( "DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, ), ), ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), ("flaubert", ("FlaubertTokenizer", None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), ("hubert", ("Wav2Vec2CTCTokenizer", None)), ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( "longt5", ( "T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None, ), ), ("luke", ("LukeTokenizer", None)), ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), ( "mbart", ( "MBartTokenizer" if is_sentencepiece_available() else None, "MBartTokenizerFast" if is_tokenizers_available() else None, ), ), ( "mbart50", ( "MBart50Tokenizer" if is_sentencepiece_available() else None, "MBart50TokenizerFast" if is_tokenizers_available() else None, ), ), ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)), ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), ( "mt5", ( "MT5Tokenizer" if is_sentencepiece_available() else None, "MT5TokenizerFast" if is_tokenizers_available() else None, ), ), ( "nystromformer", ( "AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None, ), ), 
("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)), ("opt", ("GPT2Tokenizer", None)), ( "pegasus", ( "PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None, ), ), ( "perceiver", ( "PerceiverTokenizer", None, ), ), ("phobert", ("PhobertTokenizer", None)), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("rag", ("RagTokenizer", None)), ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)), ( "reformer", ( "ReformerTokenizer" if is_sentencepiece_available() else None, "ReformerTokenizerFast" if is_tokenizers_available() else None, ), ), ( "rembert", ( "RemBertTokenizer" if is_sentencepiece_available() else None, "RemBertTokenizerFast" if is_tokenizers_available() else None, ), ), ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), ( "squeezebert", ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), ), ( "t5", ( "T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None, ), ), ("tapas", ("TapasTokenizer", None)), ("tapex", ("TapexTokenizer", None)), ("transfo-xl", ("TransfoXLTokenizer", None)), ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), ( "xglm", ( "XGLMTokenizer" if is_sentencepiece_available() else None, "XGLMTokenizerFast" if is_tokenizers_available() else None, ), ), ("xlm", ("XLMTokenizer", None)), ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), ( "xlm-roberta", ( "XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, ), ), ("xlm-roberta-xl", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ( "xlnet", ( "XLNetTokenizer" if is_sentencepiece_available() else None, "XLNetTokenizerFast" if is_tokenizers_available() else None, ), ), ( "yoso", ( "AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None, ), ), ]) SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([ ("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec"), ("data2vec-vision", "data2vec"), ]) def model_type_to_module_name(key): """Converts a config key to the corresponding module.""" # Special treatment if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] return key.replace("-", "_") class _LazyConfigMapping(OrderedDict): """ A dictionary that lazily load its values when 
they are requested. """ def __init__(self, mapping): self._mapping = mapping self._extra_content = {} self._modules = {} def __getitem__(self, key): if key in self._extra_content: return self._extra_content[key] if key not in self._mapping: raise KeyError(key) value = self._mapping[key] module_name = model_type_to_module_name(key) if module_name not in self._modules: self._modules[module_name] = importlib.import_module( f".{module_name}", "transformers.models") if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the # object at the top level. transformers_module = importlib.import_module("transformers") return getattr(transformers_module, value) def keys(self): return list(self._mapping.keys()) + list(self._extra_content.keys()) def values(self): return [self[k] for k in self._mapping.keys()] + list( self._extra_content.values()) def items(self): return [(k, self[k]) for k in self._mapping.keys()] + list( self._extra_content.items()) def __iter__(self): return iter( list(self._mapping.keys()) + list(self._extra_content.keys())) def __contains__(self, item): return item in self._mapping or item in self._extra_content def register(self, key, value): """ Register a new configuration in this mapping. """ if key in self._mapping.keys(): raise ValueError( f"'{key}' is already used by a Transformers config, pick another name." ) self._extra_content[key] = value class Trie: """ Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass Loose reference https://en.wikipedia.org/wiki/Trie """ def __init__(self): self.data = {} def add(self, word: str): """ Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. The special key `""` is used to represent termination. This function is idempotent, adding twice the same word will leave the trie unchanged Example: ```python >>> trie = Trie() >>> trie.add("Hello 友達") >>> trie.data {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} >>> trie.add("Hello") >>> trie.data {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} ``` """ if not word: # Prevent empty string return ref = self.data for char in word: ref[char] = char in ref and ref[char] or {} ref = ref[char] ref[""] = 1 def split(self, text: str) -> List[str]: """ Will look for the words added to the trie within `text`. Output is the original string splitted along the boundaries of the words found. This trie will match the longest possible word first ! Example: ```python >>> trie = Trie() >>> trie.split("[CLS] This is a extra_id_100") ["[CLS] This is a extra_id_100"] >>> trie.add("[CLS]") >>> trie.add("extra_id_1") >>> trie.add("extra_id_100") >>> trie.split("[CLS] This is a extra_id_100") ["[CLS]", " This is a ", "extra_id_100"] ``` """ # indexes are counted left of the chars index. # "hello", index 0, is left of h, index 1 is between h and e. # index 5 is right of the "o". # States are going to capture every possible start (indexes as above) # as keys, and have as values, a pointer to the position in the trie # where we're at. This is a partial match for now. # This enables to keep track of multiple matches while we're iterating # the string # If the trie contains, "blowing", and "lower" and we encounter the # string "blower", we need to split into ["b", "lower"]. 
# This is where we need to keep track of multiple possible starts. states = OrderedDict() # This will contain every indices where we need # to cut. # We force to cut at offset 0 and len(text) (added later) offsets = [0] # This is used by the lookahead which needs to skip over # some text where the full match exceeded the place in the initial # for loop skip = 0 # Main loop, Giving this algorithm O(n) complexity for current, current_char in enumerate(text): if skip and current < skip: # Prevents the lookahead for matching twice # like extra_id_100 and id_100 continue # This will track every state # that stop matching, we need to stop tracking them. # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then # fail on "b", we need to remove 0 from the valid states. to_remove = set() # Whenever we found a match, we need to drop everything # this is a greedy algorithm, it will match on the first found token reset = False # In this case, we already have partial matches (But unfinished) for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. # Lookahead to match longest first # Important in case of extra_id_1 vs extra_id_100 # Here we are also actively looking for other earlier partial # matches # "[CLS]", "L", we need to match CLS even if L is special for lookstart, looktrie_pointer in states.items(): if lookstart > start: # This partial match is later, we can stop looking break elif lookstart < start: # This partial match is earlier, the trie pointer # was already updated, so index is + 1 lookahead_index = current + 1 end = current + 1 else: # Here lookstart == start and # looktrie_pointer == trie_pointer # It wasn't updated yet so indices are current ones lookahead_index = current end = current next_char = text[ lookahead_index] if lookahead_index < len( text) else None if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index while next_char in looktrie_pointer: looktrie_pointer = looktrie_pointer[next_char] lookahead_index += 1 if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index if lookahead_index == len(text): # End of string break next_char = text[lookahead_index] # End lookahead # Storing and resetting offsets.append(start) offsets.append(end) reset = True break elif current_char in trie_pointer: # The current character being looked at has a match within the trie # update the pointer (it will be stored back into states later). trie_pointer = trie_pointer[current_char] # Storing back the new pointer into the states. # Partial matches got longer by one. states[start] = trie_pointer else: # The new character has not match in the trie, we need # to stop keeping track of this partial match. # We can't do it directly within the loop because of how # python iteration works to_remove.add(start) # Either clearing the full start (we found a real match) # Or clearing only the partial matches that didn't work. if reset: states = {} else: for start in to_remove: del states[start] # If this character is a starting character within the trie # start keeping track of this partial match. if current >= skip and current_char in self.data: states[current] = self.data[current_char] # We have a cut at the end with states. for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. 
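# --- Illustrative sketch (added commentary, not part of the original file) ---
# End-to-end behaviour of the matching loop above, mirroring the example in
# the split() docstring: registered tokens are never cut apart and the
# longest candidate wins ("extra_id_100" beats its prefix "extra_id_1").
def _demo_trie_split():
    trie = Trie()
    trie.add("[CLS]")
    trie.add("extra_id_1")
    trie.add("extra_id_100")
    parts = trie.split("[CLS] This is a extra_id_100")
    assert parts == ["[CLS]", " This is a ", "extra_id_100"]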
end = len(text) offsets.append(start) offsets.append(end) # Longest cut is always the one with lower start so the first # item so we need to break. break return self.cut_text(text, offsets) def cut_text(self, text, offsets): # We have all the offsets now, we just need to do the actual splitting. # We need to eventually add the first part of the string and the eventual # last part. offsets.append(len(text)) tokens = [] start = 0 for end in offsets: if start > end: logger.error( "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" " anyway.") continue elif start == end: # This might happen if there's a match at index 0 # we're also preventing zero-width cuts in case of two # consecutive matches continue tokens.append(text[start:end]) start = end return tokens from enum import Enum class ExplicitEnum(Enum): """ Enum with more explicit error message for missing values. """ @classmethod def _missing_(cls, value): raise ValueError( f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" ) class TensorType(ExplicitEnum): """ Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ PADDLE = "paddle" PYTORCH = "pt" TENSORFLOW = "tf" NUMPY = "np" JAX = "jax" class BatchEncoding(UserDict): """ Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`], [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc). This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes utility methods to map from word/character space to token space. Args: data (`dict`): Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods ('input_ids', 'attention_mask', etc.). encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*): If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this information. tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. prepend_batch_axis (`bool`, *optional*, defaults to `False`): Whether or not to add a batch axis when converting to tensors (see `tensor_type` above). n_sequences (`Optional[int]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. """ def __init__( self, data=None, encoding=None, tensor_type=None, prepend_batch_axis: bool=False, n_sequences=None, ): super().__init__(data) #if isinstance(encoding, EncodingFast): # encoding = [encoding] self._encodings = encoding if n_sequences is None and encoding is not None and len(encoding): n_sequences = encoding[0].n_sequences self._n_sequences = n_sequences self.convert_to_tensors( tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) @property def n_sequences(self) -> Optional[int]: """ `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this [`BatchEncoding`]. 
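# --- Illustrative sketch (added commentary, not part of the original file) ---
# What ExplicitEnum buys over a plain Enum: an invalid `return_tensors` value
# fails with a message that lists the accepted choices instead of the terse
# default error.
def _demo_tensor_type_error():
    assert TensorType("np") is TensorType.NUMPY
    raised = False
    try:
        TensorType("torch")  # not a member; "pt" is the PyTorch value
    except ValueError as err:
        raised = "not a valid TensorType" in str(err)
    assert raised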
Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of sentences) """ return self._n_sequences @property def is_fast(self) -> bool: """ `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`] or not. """ return self._encodings is not None # def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: def __getitem__(self, item): """ If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.). If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`. """ if isinstance(item, str): return self.data[item] elif self._encodings is not None: return self._encodings[item] else: raise KeyError( "Indexing with integers (to access backend Encoding for a given batch index) " "is not available when using Python based tokenizers") def __getattr__(self, item: str): try: return self.data[item] except KeyError: raise AttributeError def __getstate__(self): return {"data": self.data, "encodings": self._encodings} def __setstate__(self, state): if "data" in state: self.data = state["data"] if "encodings" in state: self._encodings = state["encodings"] def keys(self): return self.data.keys() def values(self): return self.data.values() def items(self): return self.data.items() # After this point: # Extended properties and methods only available for fast (Rust-based) tokenizers # provided by HuggingFace tokenizers library. @property def encodings(self): """ `Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if the input was tokenized through Python (i.e., not a fast) tokenizer. """ return self._encodings def tokens(self, batch_index=0): """ Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to integer indices) at a given batch index (only works for the output of a fast tokenizer). Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[str]`: The list of tokens at that index. """ if not self._encodings: raise ValueError( "tokens() is not available when using Python-based tokenizers") return self._encodings[batch_index].tokens def sequence_ids(self, batch_index=0): """ Return a list mapping the tokens to the id of their original sentences: - `None` for special tokens added around or between sequences, - `0` for tokens corresponding to words in the first sequence, - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly encoded. Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding sequence. """ if not self._encodings: raise ValueError( "sequence_ids() is not available when using Python-based tokenizers" ) return self._encodings[batch_index].sequence_ids def words(self, batch_index=0): """ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[Optional[int]]`: A list indicating the word corresponding to each token. 
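# --- Illustrative sketch (added commentary, not part of the original file) ---
# For a slow (pure-Python) tokenizer there are no backend `tokenizers.Encoding`
# objects, so a BatchEncoding is just a dict with attribute access; integer
# indexing and helpers such as tokens() are the fast-tokenizer extras above.
# The ids below are made-up placeholders.
def _demo_batch_encoding_access():
    enc = BatchEncoding({"input_ids": [[101, 2023, 102]],
                         "attention_mask": [[1, 1, 1]]})
    assert enc["input_ids"] == [[101, 2023, 102]]   # key access
    assert enc.attention_mask == [[1, 1, 1]]        # attribute access (__getattr__)
    assert enc.is_fast is False                     # no backend encodings attached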
Special tokens added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word (several tokens will be mapped to the same word index if they are parts of that word). """ if not self._encodings: raise ValueError( "words() is not available when using Python-based tokenizers") warnings.warn( "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " "but more self-explanatory `BatchEncoding.word_ids()` property.", FutureWarning, ) return self.word_ids(batch_index) def word_ids(self, batch_index: int=0) -> List[Optional[int]]: """ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word (several tokens will be mapped to the same word index if they are parts of that word). """ if not self._encodings: raise ValueError( "word_ids() is not available when using Python-based tokenizers" ) return self._encodings[batch_index].word_ids def token_to_sequence(self, batch_or_token_index, token_index): """ Get the index of the sequence represented by the given token. In the general use case, this method returns `0` for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair Can be called as: - `self.token_to_sequence(token_index)` if batch size is 1 - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_token_index (`int`): Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of the token in the sequence. token_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the sequence. Returns: `int`: Index of the word in the input sequence. """ if not self._encodings: raise ValueError( "token_to_sequence() is not available when using Python based tokenizers" ) if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index if batch_index < 0: batch_index = self._batch_size + batch_index if token_index < 0: token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_sequence(token_index) def token_to_word(self, batch_or_token_index, token_index=None): """ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. Can be called as: - `self.token_to_word(token_index)` if batch size is 1 - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_token_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence. 
token_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the sequence. Returns: `int`: Index of the word in the input sequence. """ if not self._encodings: raise ValueError( "token_to_word() is not available when using Python based tokenizers" ) if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index if batch_index < 0: batch_index = self._batch_size + batch_index if token_index < 0: token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_word(token_index) def word_to_tokens(self, batch_or_word_index, word_index=None, sequence_index=0): """ Get the encoded token span corresponding to a word in a sequence of the batch. Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with: - **start** -- Index of the first token. - **end** -- Index of the token following the last token. Can be called as: - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1 - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_word_index (`int`): Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of the word in the sequence. word_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the sequence. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided word index belongs to. Returns: Optional [`~tokenization_utils_base.TokenSpan`] Span of tokens in the encoded sequence. Returns `None` if no tokens correspond to the word. """ if not self._encodings: raise ValueError( "word_to_tokens() is not available when using Python based tokenizers" ) if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index if batch_index < 0: batch_index = self._batch_size + batch_index if word_index < 0: word_index = self._seq_len + word_index span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) return TokenSpan(*span) if span is not None else None def token_to_chars(self, batch_or_token_index: int, token_index=None): """ Get the character span corresponding to an encoded token in a sequence of the batch. Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with: - **start** -- Index of the first character in the original string associated to the token. - **end** -- Index of the character following the last character in the original string associated to the token. Can be called as: - `self.token_to_chars(token_index)` if batch size is 1 - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1 Args: batch_or_token_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence. token_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in the sequence. 
Returns: [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token (e.g. , ) doesn't correspond to any chars in the origin string. """ if not self._encodings: raise ValueError( "token_to_chars() is not available when using Python based tokenizers" ) if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index span_indices = self._encodings[batch_index].token_to_chars(token_index) return CharSpan(*span_indices) if span_indices is not None else None def char_to_token(self, batch_or_char_index: int, char_index: Optional[int]=None, sequence_index: int=0) -> int: """ Get the index of the token in the encoded output comprising a character in the original string for a sequence of the batch. Can be called as: - `self.char_to_token(char_index)` if batch size is 1 - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence char_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the sequence. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided character index belongs to. Returns: `int`: Index of the token. """ if not self._encodings: raise ValueError( "char_to_token() is not available when using Python based tokenizers" ) if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_token(char_index, sequence_index) def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int]=None, sequence_index: int=0): """ Get the character span in the original string corresponding to given word in a sequence of the batch. Character spans are returned as a CharSpan NamedTuple with: - start: index of the first character in the original string - end: index of the character following the last character in the original string Can be called as: - `self.word_to_chars(word_index)` if batch size is 1 - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1 Args: batch_or_word_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence word_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the sequence. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided word index belongs to. Returns: `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. 
CharSpan are NamedTuple with: - start: index of the first character associated to the token in the original string - end: index of the character following the last character associated to the token in the original string """ if not self._encodings: raise ValueError( "word_to_chars() is not available when using Python based tokenizers" ) if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index return CharSpan(*(self._encodings[batch_index].word_to_chars( word_index, sequence_index))) def char_to_word(self, batch_or_char_index: int, char_index: Optional[int]=None, sequence_index: int=0) -> int: """ Get the word in the original string corresponding to a character in the original string of a sequence of the batch. Can be called as: - `self.char_to_word(char_index)` if batch size is 1 - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the character in the original string. char_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the original string. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided character index belongs to. Returns: `int` or `List[int]`: Index or indices of the associated encoded token(s). """ if not self._encodings: raise ValueError( "char_to_word() is not available when using Python based tokenizers" ) if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_word(char_index, sequence_index) def convert_to_tensors(self, tensor_type=None, prepend_batch_axis: bool=False): """ Convert the inner content to tensors. Args: tensor_type (`str` or [`~utils.TensorType`], *optional*): The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If `None`, no modification is done. prepend_batch_axis (`int`, *optional*, defaults to `False`): Whether or not to add the batch dimension during the conversion. """ if tensor_type is None: return self # Get a function reference for the correct framework if tensor_type == 'paddle': import paddle as_tensor = paddle.to_tensor is_tensor = paddle.is_tensor else: as_tensor = np.asarray is_tensor = _is_numpy # (mfuntowicz: This code is unreachable) # else: # raise ImportError( # f"Unable to convert output to tensors format {tensor_type}" # ) # Do the tensor conversion in batch for key, value in self.items(): try: if prepend_batch_axis: value = [value] if not is_tensor(value): tensor = as_tensor(value) # Removing this for now in favor of controlling the shape with `prepend_batch_axis` # # at-least2d # if tensor.ndim > 2: # tensor = tensor.squeeze(0) # elif tensor.ndim < 2: # tensor = tensor[None, :] self[key] = tensor except: # noqa E722 if key == "overflowing_tokens": raise ValueError( "Unable to create tensor returning overflowing tokens of different lengths. " "Please see if a fast version of this tokenizer is available to have this feature available." 
) raise ValueError( "Unable to create tensor, you should probably activate truncation and/or padding " "with 'padding=True' 'truncation=True' to have batched tensors with the same length." ) return self class TruncationStrategy(ExplicitEnum): """ Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ ONLY_FIRST = "only_first" ONLY_SECOND = "only_second" LONGEST_FIRST = "longest_first" DO_NOT_TRUNCATE = "do_not_truncate" class PaddingStrategy(ExplicitEnum): """ Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ LONGEST = "longest" MAX_LENGTH = "max_length" DO_NOT_PAD = "do_not_pad" class SpecialTokensMixin: """ A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be used to directly access these special tokens in a model-independent manner and allow to set and update the special tokens. Args: bos_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing the beginning of a sentence. eos_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing the end of a sentence. unk_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing an out-of-vocabulary token. sep_token (`str` or `tokenizers.AddedToken`, *optional*): A special token separating two different sentences in the same input (used by BERT for instance). pad_token (`str` or `tokenizers.AddedToken`, *optional*): A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by attention mechanisms or loss computation. cls_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing the class of the input (used by BERT for instance). mask_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): A tuple or a list of additional special tokens. """ SPECIAL_TOKENS_ATTRIBUTES = [ "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", "additional_special_tokens", ] def __init__(self, verbose=True, **kwargs): self._bos_token = None self._eos_token = None self._unk_token = None self._sep_token = None self._pad_token = None self._cls_token = None self._mask_token = None self._pad_token_type_id = 0 self._additional_special_tokens = [] self.verbose = verbose self.added_tokens_encoder: Dict[str, int] = {} self.added_tokens_decoder: Dict[int, str] = {} self.unique_no_split_tokens: List[str] = [] self.tokens_trie = Trie() self._decode_use_source_tokenizer = False # We directly set the hidden value to allow initialization with special tokens # which are not yet in the vocabulary. 
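# --- Illustrative sketch (added commentary, not part of the original file) ---
# Why the ValueError above points at padding/truncation: tensors have to be
# rectangular, so every sample must reach the same length before
# convert_to_tensors() can stack them.  PaddingStrategy / TruncationStrategy
# above enumerate the accepted string values for those arguments.  A minimal
# sketch, assuming Paddle is installed (the ids are made up):
def _demo_convert_to_paddle_tensors():
    enc = BatchEncoding({"input_ids": [[1, 2, 3], [4, 5, 0]]},   # already padded
                        tensor_type=TensorType.PADDLE.value)
    assert list(enc["input_ids"].shape) == [2, 3]
    # a ragged batch such as [[1, 2, 3], [4]] would instead raise the
    # ValueError above, suggesting padding=True / truncation=True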
Necessary for serialization/de-serialization # TODO clean this up at some point (probably by switching to fast tokenizers) for key, value in kwargs.items(): if value is None: continue if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == "additional_special_tokens": assert isinstance(value, ( list, tuple)), f"Value {value} is not a list or tuple" assert all( isinstance(t, (str, AddedToken)) for t in value ), "One of the tokens is not a string or an AddedToken" setattr(self, key, value) elif isinstance(value, (str, AddedToken)): setattr(self, key, value) else: raise TypeError( f"special token {key} has to be either str or AddedToken but got: {type(value)}" ) def convert_tokens_to_ids( self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. Args: tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. """ if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def sanitize_special_tokens(self) -> int: """ Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the vocabulary. Add the missing ones to the vocabulary if needed. Return: `int`: The number of tokens added in the vocabulary during the operation. """ return self.add_tokens( self.all_special_tokens_extended, special_tokens=True) def add_special_tokens( self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: """ Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the model so that its embedding matrix matches the tokenizer. In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. Using `add_special_tokens` will ensure your special tokens can be used in several ways: - Special tokens are carefully handled by the tokenizer (they are never split). - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. When possible, special tokens are already registered for provided pretrained models (for instance [`BertTokenizer`] `cls_token` is already registered to be :obj*'[CLS]'* and XLM's one is also registered to be `''`). Args: special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`): Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`]. 
Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the `unk_token` to them). Returns: `int`: Number of tokens added to the vocabulary. Examples: ```python # Let's see how to add a new classification token to GPT-2 tokenizer = GPT2Tokenizer.from_pretrained("gpt2") model = GPT2Model.from_pretrained("gpt2") special_tokens_dict = {"cls_token": ""} num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print("We have added", num_added_toks, "tokens") # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. model.resize_token_embeddings(len(tokenizer)) assert tokenizer.cls_token == "" ```""" if not special_tokens_dict: return 0 added_tokens = 0 for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" if self.verbose: #logger.info(f"Assigning {value} to the {key} key of the tokenizer") print(f"Assigning {value} to the {key} key of the tokenizer") setattr(self, key, value) if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all( isinstance(t, (str, AddedToken)) for t in value ), f"Tokens {value} for key {key} should all be str or AddedToken instances" added_tokens += self.add_tokens(value, special_tokens=True) else: assert isinstance( value, (str, AddedToken) ), f"Token {value} for key {key} should be a str or an AddedToken instance" added_tokens += self.add_tokens([value], special_tokens=True) return added_tokens def add_tokens( self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool=False) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the model so that its embedding matrix matches the tokenizer. In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. Args: new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`): Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string token to let you personalize its behavior: whether this token should only match against a single word, whether this token should strip all potential whitespaces on the left side, whether this token should strip all potential whitespaces on the right side, etc. special_tokens (`bool`, *optional*, defaults to `False`): Can be used to specify if the token is a special token. This mostly change the normalization behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance). See details for `tokenizers.AddedToken` in HuggingFace tokenizers library. Returns: `int`: Number of tokens added to the vocabulary. Examples: ```python # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") model = BertModel.from_pretrained("bert-base-uncased") num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) print("We have added", num_added_toks, "tokens") # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. 
model.resize_token_embeddings(len(tokenizer)) ```""" if not new_tokens: return 0 if not isinstance(new_tokens, (list, tuple)): new_tokens = [new_tokens] return self._add_tokens(new_tokens, special_tokens=special_tokens) def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool=False) -> int: new_tokens = [str(tok) for tok in new_tokens] tokens_to_add = [] for token in new_tokens: if not isinstance(token, str): raise TypeError( f"Token {token} is not a string but a {type(token)}.") if not special_tokens and hasattr( self, "do_lower_case") and self.do_lower_case: token = token.lower() if (token != self.unk_token and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and token not in tokens_to_add): tokens_to_add.append(token) #if self.verbose: #logger.info(f"Adding {token} to the vocabulary") #print(f"Adding {token} to the vocabulary") added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.added_tokens_decoder.update(added_tok_decoder) # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) if special_tokens: if len(new_tokens) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union(set(new_tokens))) else: # Or on the newly added tokens if len(tokens_to_add) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union( set(tokens_to_add))) self._create_trie(self.unique_no_split_tokens) return len(tokens_to_add) def _create_trie(self, unique_no_split_tokens): trie = Trie() for token in unique_no_split_tokens: if hasattr( self, "do_lower_case" ) and self.do_lower_case and token not in self.all_special_tokens: trie.add(token.lower()) else: trie.add(token) self.tokens_trie = trie @property def bos_token(self) -> str: """ `str`: Beginning of sentence token. Log an error if used while not having been set. """ if self._bos_token is None and self.verbose: print("Using bos_token, but it is not set yet.") #logger.error("Using bos_token, but it is not set yet.") return None return str(self._bos_token) @property def eos_token(self) -> str: """ `str`: End of sentence token. Log an error if used while not having been set. """ if self._eos_token is None and self.verbose: #logger.error("Using eos_token, but it is not set yet.") print("Using eos_token, but it is not set yet.") return None return str(self._eos_token) @property def unk_token(self) -> str: """ `str`: Unknown token. Log an error if used while not having been set. """ if self._unk_token is None and self.verbose: print("Using unk_token, but it is not set yet.") #logger.error("Using unk_token, but it is not set yet.") return None return str(self._unk_token) @property def sep_token(self) -> str: """ `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not having been set. """ if self._sep_token is None and self.verbose: print("Using sep_token, but it is not set yet.") #logger.error("Using sep_token, but it is not set yet.") return None return str(self._sep_token) @property def pad_token(self) -> str: """ `str`: Padding token. Log an error if used while not having been set. 
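# --- Illustrative sketch (added commentary, not part of the original file) ---
# How _add_tokens() above assigns ids: genuinely new tokens are appended
# after the existing vocabulary, so the first one gets id len(tokenizer), the
# next len(tokenizer) + 1, and so on.  The size and token names below are
# hypothetical; this is also why the docstrings insist on calling
# model.resize_token_embeddings(len(tokenizer)) afterwards.
def _demo_added_token_ids():
    current_size = 50257                      # pretend len(tokenizer)
    tokens_to_add = ["<ent>", "<rel>"]        # pretend new tokens
    added_tok_encoder = {tok: current_size + i
                         for i, tok in enumerate(tokens_to_add)}
    assert added_tok_encoder == {"<ent>": 50257, "<rel>": 50258}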
""" if self._pad_token is None and self.verbose: #logger.error("Using pad_token, but it is not set yet.") print("Using pad_token, but it is not set yet.") return None return str(self._pad_token) @property def cls_token(self) -> str: """ `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ if self._cls_token is None and self.verbose: #logger.error("Using cls_token, but it is not set yet.") print("Using cls_token, but it is not set yet.") return None return str(self._cls_token) @property def mask_token(self) -> str: """ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set. """ if self._mask_token is None and self.verbose: #logger.error("Using mask_token, but it is not set yet.") print("Using mask_token, but it is not set yet.") return None return str(self._mask_token) @property def additional_special_tokens(self) -> List[str]: """ `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been set. """ if self._additional_special_tokens is None and self.verbose: #logger.error("Using additional_special_tokens, but it is not set yet.") print("Using additional_special_tokens, but it is not set yet.") return None return [str(tok) for tok in self._additional_special_tokens] @bos_token.setter def bos_token(self, value): self._bos_token = value @eos_token.setter def eos_token(self, value): self._eos_token = value @unk_token.setter def unk_token(self, value): self._unk_token = value @sep_token.setter def sep_token(self, value): self._sep_token = value @pad_token.setter def pad_token(self, value): self._pad_token = value @cls_token.setter def cls_token(self, value): self._cls_token = value @mask_token.setter def mask_token(self, value): self._mask_token = value @additional_special_tokens.setter def additional_special_tokens(self, value): self._additional_special_tokens = value @property def bos_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not been set. """ if self._bos_token is None: return None return self.convert_tokens_to_ids(self.bos_token) @property def eos_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been set. """ if self._eos_token is None: return None return self.convert_tokens_to_ids(self.eos_token) @property def unk_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set. """ if self._unk_token is None: return None return self.convert_tokens_to_ids(self.unk_token) @property def sep_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input sequence. Returns `None` if the token has not been set. """ if self._sep_token is None: return None return self.convert_tokens_to_ids(self.sep_token) @property def pad_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set. """ if self._pad_token is None: return None return self.convert_tokens_to_ids(self.pad_token) @property def pad_token_type_id(self) -> int: """ `int`: Id of the padding token type in the vocabulary. 
""" return self._pad_token_type_id @property def cls_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Returns `None` if the token has not been set. """ if self._cls_token is None: return None return self.convert_tokens_to_ids(self.cls_token) @property def mask_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language modeling. Returns `None` if the token has not been set. """ if self._mask_token is None: return None return self.convert_tokens_to_ids(self.mask_token) @property def additional_special_tokens_ids(self) -> List[int]: """ `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.additional_special_tokens) @bos_token_id.setter def bos_token_id(self, value): self._bos_token = self.convert_ids_to_tokens( value) if value is not None else None @eos_token_id.setter def eos_token_id(self, value): self._eos_token = self.convert_ids_to_tokens( value) if value is not None else None @unk_token_id.setter def unk_token_id(self, value): self._unk_token = self.convert_ids_to_tokens( value) if value is not None else None @sep_token_id.setter def sep_token_id(self, value): self._sep_token = self.convert_ids_to_tokens( value) if value is not None else None @pad_token_id.setter def pad_token_id(self, value): self._pad_token = self.convert_ids_to_tokens( value) if value is not None else None @cls_token_id.setter def cls_token_id(self, value): self._cls_token = self.convert_ids_to_tokens( value) if value is not None else None @mask_token_id.setter def mask_token_id(self, value): self._mask_token = self.convert_ids_to_tokens( value) if value is not None else None @additional_special_tokens_ids.setter def additional_special_tokens_ids(self, values): self._additional_special_tokens = [ self.convert_ids_to_tokens(value) for value in values ] @property def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: """ `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.). Convert potential tokens of `tokenizers.AddedToken` type to string. """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: attr_value = getattr(self, "_" + attr) if attr_value: set_attr[attr] = (type(attr_value)( str(attr_value_sub) for attr_value_sub in attr_value) if isinstance(attr_value, (list, tuple)) else str(attr_value)) return set_attr @property def special_tokens_map_extended(self) -> Dict[str, Union[ str, AddedToken, List[Union[str, AddedToken]]]]: """ `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.). Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how special tokens are tokenized. """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: attr_value = getattr(self, "_" + attr) if attr_value: set_attr[attr] = attr_value return set_attr @property def all_special_tokens(self) -> List[str]: """ `List[str]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. Convert tokens of `tokenizers.AddedToken` type to string. 
""" all_toks = [str(s) for s in self.all_special_tokens_extended] return all_toks @property def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: """ `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how special tokens are tokenized. """ all_toks = [] set_attr = self.special_tokens_map_extended for attr_value in set_attr.values(): all_toks = all_toks + (list(attr_value) if isinstance(attr_value, ( list, tuple)) else [attr_value]) all_toks = list(OrderedDict.fromkeys(all_toks)) return all_toks @property def all_special_ids(self) -> List[int]: """ `List[int]`: List the ids of the special tokens(`''`, `''`, etc.) mapped to class attributes. """ all_toks = self.all_special_tokens all_ids = self.convert_tokens_to_ids(all_toks) return all_ids ================================================ FILE: ppfleetx/data/transforms/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/transforms/preprocess.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals from functools import partial import math import random import cv2 import numpy as np from PIL import Image from PIL import ImageFilter from paddle.vision.transforms import functional as F from paddle.vision.transforms import ColorJitter as PPColorJitter from paddle.vision.transforms import Grayscale from ppfleetx.utils.log import logger class OperatorParamError(ValueError): """ OperatorParamError """ pass class DecodeImage(object): """ decode image """ def __init__(self, to_rgb=True, channel_first=False): self.to_rgb = to_rgb self.channel_first = channel_first def __call__(self, img): assert type(img) is bytes and len( img) > 0, "invalid input 'img' in DecodeImage" data = np.frombuffer(img, dtype='uint8') img = cv2.imdecode(data, 1) if self.to_rgb: assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( img.shape) img = img[:, :, ::-1] if self.channel_first: img = img.transpose((2, 0, 1)) return img class UnifiedResize(object): def __init__(self, interpolation=None, backend="cv2"): _cv2_interp_from_str = { 'nearest': cv2.INTER_NEAREST, 'bilinear': cv2.INTER_LINEAR, 'area': cv2.INTER_AREA, 'bicubic': cv2.INTER_CUBIC, 'lanczos': cv2.INTER_LANCZOS4 } _pil_interp_from_str = { 'nearest': Image.NEAREST, 'bilinear': Image.BILINEAR, 'bicubic': Image.BICUBIC, 'box': Image.BOX, 'lanczos': Image.LANCZOS, 'hamming': Image.HAMMING } def _pil_resize(src, size, resample): pil_img = Image.fromarray(src) pil_img = pil_img.resize(size, resample) return np.asarray(pil_img) if backend.lower() == "cv2": if isinstance(interpolation, str): interpolation = _cv2_interp_from_str[interpolation.lower()] # compatible with opencv < version 4.4.0 elif interpolation is None: interpolation = cv2.INTER_LINEAR self.resize_func = partial(cv2.resize, interpolation=interpolation) elif backend.lower() == "pil": if isinstance(interpolation, str): interpolation = _pil_interp_from_str[interpolation.lower()] self.resize_func = partial(_pil_resize, resample=interpolation) else: logger.warning( f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." 
) self.resize_func = cv2.resize def __call__(self, src, size): return self.resize_func(src, size) class ResizeImage(object): """ resize image """ def __init__(self, size=None, resize_short=None, interpolation=None, backend="cv2"): if resize_short is not None and resize_short > 0: self.resize_short = resize_short self.w = None self.h = None elif size is not None: self.resize_short = None self.w = size if type(size) is int else size[0] self.h = size if type(size) is int else size[1] else: raise OperatorParamError("invalid params for ReisizeImage for '\ 'both 'size' and 'resize_short' are None") self._resize_func = UnifiedResize( interpolation=interpolation, backend=backend) def __call__(self, img): img_h, img_w = img.shape[:2] if self.resize_short is not None: percent = float(self.resize_short) / min(img_w, img_h) w = int(round(img_w * percent)) h = int(round(img_h * percent)) else: w = self.w h = self.h return self._resize_func(img, (w, h)) class CenterCropImage(object): """ crop image """ def __init__(self, size): if type(size) is int: self.size = (size, size) else: self.size = size # (h, w) def __call__(self, img): w, h = self.size img_h, img_w = img.shape[:2] w_start = (img_w - w) // 2 h_start = (img_h - h) // 2 w_end = w_start + w h_end = h_start + h return img[h_start:h_end, w_start:w_end, :] class RandCropImage(object): """ random crop image """ def __init__(self, size, scale=None, ratio=None, interpolation=None, backend="cv2"): if type(size) is int: self.size = (size, size) # (h, w) else: self.size = size self.scale = [0.08, 1.0] if scale is None else scale self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio self._resize_func = UnifiedResize( interpolation=interpolation, backend=backend) def __call__(self, img): size = self.size scale = self.scale ratio = self.ratio aspect_ratio = math.sqrt(random.uniform(*ratio)) w = 1. * aspect_ratio h = 1. / aspect_ratio img_h, img_w = img.shape[:2] bound = min((float(img_w) / img_h) / (w**2), (float(img_h) / img_w) / (h**2)) scale_max = min(scale[1], bound) scale_min = min(scale[0], bound) target_area = img_w * img_h * random.uniform(scale_min, scale_max) target_size = math.sqrt(target_area) w = int(target_size * w) h = int(target_size * h) i = random.randint(0, img_w - w) j = random.randint(0, img_h - h) img = img[j:j + h, i:i + w, :] return self._resize_func(img, size) class RandFlipImage(object): """ random flip image flip_code: 1: Flipped Horizontally 0: Flipped Vertically -1: Flipped Horizontally & Vertically """ def __init__(self, flip_code=1): assert flip_code in [-1, 0, 1 ], "flip_code should be a value in [-1, 0, 1]" self.flip_code = flip_code def __call__(self, img): if random.randint(0, 1) == 1: return cv2.flip(img, self.flip_code) else: return img class NormalizeImage(object): """ normalize image such as substract mean, divide std """ def __init__(self, scale=None, mean=None, std=None, order='chw', output_fp16=False, channel_num=3): if isinstance(scale, str): scale = eval(scale) assert channel_num in [ 3, 4 ], "channel number of input image should be set to 3 or 4." 
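# --- Illustrative sketch (added commentary, not part of the original file) ---
# Typical evaluation-style use of the operators above: decode raw JPEG bytes,
# resize the short side to 256, then take a 224x224 center crop.  The image
# here is synthetic; cv2 and np are already imported at the top of this file.
def _demo_resize_and_center_crop():
    fake = (np.random.rand(300, 400, 3) * 255).astype("uint8")
    ok, buf = cv2.imencode(".jpg", fake)            # produce JPEG bytes to decode
    assert ok
    img = DecodeImage(to_rgb=True)(buf.tobytes())   # HWC uint8, RGB
    img = ResizeImage(resize_short=256)(img)        # short side -> 256, keep aspect
    img = CenterCropImage(size=224)(img)            # 224 x 224 center crop
    assert img.shape[:2] == (224, 224)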
self.channel_num = channel_num self.output_dtype = 'float16' if output_fp16 else 'float32' self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) self.order = order mean = mean if mean is not None else [0.485, 0.456, 0.406] std = std if std is not None else [0.229, 0.224, 0.225] shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3) self.mean = np.array(mean).reshape(shape).astype('float32') self.std = np.array(std).reshape(shape).astype('float32') def __call__(self, img): if isinstance(img, Image.Image): img = np.array(img) assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage" img = (img.astype('float32') * self.scale - self.mean) / self.std if self.channel_num == 4: img_h = img.shape[1] if self.order == 'chw' else img.shape[0] img_w = img.shape[2] if self.order == 'chw' else img.shape[1] pad_zeros = np.zeros( (1, img_h, img_w)) if self.order == 'chw' else np.zeros( (img_h, img_w, 1)) img = (np.concatenate( (img, pad_zeros), axis=0) if self.order == 'chw' else np.concatenate( (img, pad_zeros), axis=2)) return img.astype(self.output_dtype) class ToCHWImage(object): """ convert hwc image to chw image """ def __init__(self): pass def __call__(self, img): if isinstance(img, Image.Image): img = np.array(img) return img.transpose((2, 0, 1)) class ColorJitter(PPColorJitter): """ColorJitter. """ def __init__(self, *args, **kwargs): self.p = kwargs.pop('p', 1.0) super().__init__(*args, **kwargs) def __call__(self, img): if random.random() < self.p: if not isinstance(img, Image.Image): img = np.ascontiguousarray(img) img = Image.fromarray(img) img = super()._apply_image(img) if isinstance(img, Image.Image): img = np.asarray(img) return img class GaussianBlur(object): """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" def __init__(self, sigma=[.1, 2.], p=1.0): self.p = p self.sigma = sigma def __call__(self, img): if random.random() < self.p: if not isinstance(img, Image.Image): img = np.ascontiguousarray(img) img = Image.fromarray(img) sigma = random.uniform(self.sigma[0], self.sigma[1]) img = img.filter(ImageFilter.GaussianBlur(radius=sigma)) if isinstance(img, Image.Image): img = np.asarray(img) return img class Pixels(object): def __init__(self, mode="const", mean=[0., 0., 0.]): self._mode = mode self._mean = mean def __call__(self, h=224, w=224, c=3): if self._mode == "rand": return np.random.normal(size=(1, 1, 3)) elif self._mode == "pixel": return np.random.normal(size=(h, w, c)) elif self._mode == "const": return self._mean else: raise Exception( "Invalid mode in RandomErasing, only support \"const\", \"rand\", \"pixel\"" ) class RandomErasing(object): """RandomErasing. This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm. 
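# --- Illustrative sketch (added commentary, not part of the original file) ---
# NormalizeImage above computes (img * scale - mean) / std with ImageNet
# defaults (scale=1/255, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]);
# with order="hwc" the stats broadcast over the last axis, and ToCHWImage then
# moves channels first for Paddle's conv layers.
def _demo_normalize_and_to_chw():
    img = np.full((224, 224, 3), 128, dtype="uint8")     # flat mid-grey image
    out = NormalizeImage(order="hwc")(img)
    # red channel: (128/255 - 0.485) / 0.229
    assert abs(out[0, 0, 0] - (128 / 255.0 - 0.485) / 0.229) < 1e-4
    chw = ToCHWImage()(out)
    assert chw.shape == (3, 224, 224)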
""" def __init__(self, EPSILON=0.5, sl=0.02, sh=0.4, r1=0.3, mean=[0., 0., 0.], attempt=100, use_log_aspect=False, mode='const'): self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON self.sl = eval(sl) if isinstance(sl, str) else sl self.sh = eval(sh) if isinstance(sh, str) else sh r1 = eval(r1) if isinstance(r1, str) else r1 self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else ( r1, 1 / r1) self.use_log_aspect = use_log_aspect self.attempt = attempt self.get_pixels = Pixels(mode, mean) def __call__(self, img): if random.random() > self.EPSILON: return img for _ in range(self.attempt): area = img.shape[0] * img.shape[1] target_area = random.uniform(self.sl, self.sh) * area aspect_ratio = random.uniform(*self.r1) if self.use_log_aspect: aspect_ratio = math.exp(aspect_ratio) h = int(round(math.sqrt(target_area * aspect_ratio))) w = int(round(math.sqrt(target_area / aspect_ratio))) if w < img.shape[1] and h < img.shape[0]: pixels = self.get_pixels(h, w, img.shape[2]) x1 = random.randint(0, img.shape[0] - h) y1 = random.randint(0, img.shape[1] - w) if img.shape[2] == 3: img[x1:x1 + h, y1:y1 + w, :] = pixels else: img[x1:x1 + h, y1:y1 + w, 0] = pixels[0] return img return img class RandomGrayscale(object): """Randomly convert image to grayscale with a probability of p (default 0.1). Args: p (float): probability that image should be converted to grayscale. Returns: PIL Image: Grayscale version of the input image with probability p and unchanged with probability (1-p). - If input image is 1 channel: grayscale version is 1 channel - If input image is 3 channel: grayscale version is 3 channel with r == g == b """ def __init__(self, p=0.1): self.p = p def __call__(self, img): """ Args: img (PIL Image): Image to be converted to grayscale. Returns: PIL Image: Randomly grayscaled image. """ flag = False if not isinstance(img, Image.Image): img = np.ascontiguousarray(img) img = Image.fromarray(img) flag = True num_output_channels = 1 if img.mode == 'L' else 3 if random.random() < self.p: img = F.to_grayscale(img, num_output_channels=num_output_channels) if flag: img = np.asarray(img) return img ================================================ FILE: ppfleetx/data/transforms/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . 
import preprocess def transform(data, ops=[]): """ transform """ for op in ops: data = op(data) return data def create_preprocess_operators(params): """ create operators based on the config Args: params(list): a dict list, used to create some operators """ assert isinstance(params, list), ('operator config should be a list') ops = [] for operator in params: assert isinstance(operator, dict) and len(operator) == 1, "yaml format error" op_name = list(operator)[0] param = {} if operator[op_name] is None else operator[op_name] op = getattr(preprocess, op_name)(**param) ops.append(op) return ops ================================================ FILE: ppfleetx/data/utils/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .batch_collate_fn import * ================================================ FILE: ppfleetx/data/utils/batch_collate_fn.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import os import sys import numbers import numpy as np from dataclasses import dataclass try: from collections.abc import Sequence, Mapping except: from collections import Sequence, Mapping from ppfleetx.data.sampler import Stack, Tuple def collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`, get input data as a list of sample datas, each element in list if the data of a sample, and sample data should composed of list, dictionary, string, number, numpy array and paddle.Tensor, this function will parse input data recursively and stack number, numpy array and paddle.Tensor datas as batch datas. e.g. for following input data: [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, {'image': np.array(shape=[3, 224, 224]), 'label': 3}, {'image': np.array(shape=[3, 224, 224]), 'label': 4}, {'image': np.array(shape=[3, 224, 224]), 'label': 5},] This default collate function zipped each number and numpy array field together and stack each field as the batch field as follows: {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} Args: batch(list of sample data): batch should be a list of sample data. Returns: Batched data: batched each number, numpy array and paddle.Tensor in input data. 
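# Illustrative sketch (editorial addition): usage of ``collate_fn`` mirroring
# the docstring above, where a list of dict samples is collated field by field.
import numpy as np

batch = [
    {'image': np.zeros([3, 224, 224], dtype='float32'), 'label': 1},
    {'image': np.ones([3, 224, 224], dtype='float32'), 'label': 3},
]
collated = collate_fn(batch)
# collated['image'].shape == (2, 3, 224, 224)
# collated['label'] == np.array([1, 3])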
""" sample = batch[0] if isinstance(sample, np.ndarray): batch = np.stack(batch, axis=0) return batch elif isinstance(sample, paddle.Tensor): return paddle.stack(batch, axis=0) elif isinstance(sample, numbers.Number): batch = np.array(batch) return batch elif isinstance(sample, (str, bytes)): return batch elif isinstance(sample, Mapping): return {key: collate_fn([d[key] for d in batch]) for key in sample} elif isinstance(sample, Sequence): sample_fields_num = len(sample) if not all(len(sample) == sample_fields_num for sample in iter(batch)): raise RuntimeError( "fileds number not same among samples in a batch") return [collate_fn(fields) for fields in zip(*batch)] raise TypeError("batch data con only contains: tensor, numpy.ndarray, " "dict, list, number, but got {}".format(type(sample))) def default_collate_fn(batch_transform=None): if batch_transform is not None: # batch_ops = create_preprocess_operators(batch_transform) # def inner_collate_fn(batch): # batch = transform(batch, batch_ops) # batch = collate_fn(batch) # return batch # return inner_collate_fn pass else: return collate_fn def gpt_collate_fn(batch): return Tuple([Stack() for raw in zip(*batch)])(batch) class ErnieCollateData(): def __init__(self, micro_batch_size=1): self.micro_batch_size = micro_batch_size def generate_data(self, data, stack_fn=Stack()): num_fields = len(data[0]) out = [None] * num_fields # 0. input_ids, # 1. segment_ids, # 2. input_mask, # 3. masked_lm_positions, # 4. masked_lm_labels, # 5. next_sentence_labels for i in (0, 1, 2, 5): out[i] = stack_fn([x[i] for x in data]) out[5] = out[5].reshape([-1, 1]) batch_size, seq_length = out[0].shape size = num_mask = sum(len(x[3]) for x in data) # masked_lm_positions # Organize as a 1D tensor for gather or use gather_nd if size % 8 != 0: size += 8 - (size % 8) out[3] = np.full(size, 0, dtype=np.int32) # masked_lm_labels out[4] = np.full([size, 1], -1, dtype=np.int64) mask_token_num = 0 for i, x in enumerate(data): for j, pos in enumerate(x[3]): out[3][mask_token_num] = i * seq_length + pos out[4][mask_token_num] = x[4][j] mask_token_num += 1 return out def __call__(self, data): accumulate_steps = len(data) // self.micro_batch_size if accumulate_steps == 1: return self.generate_data(data) else: self.micro_batch_size = len(data) // accumulate_steps all_data = [[] for _ in range(6)] for acc_step in range(accumulate_steps): tmp = self.generate_data( data[acc_step * self.micro_batch_size:(acc_step + 1) * self.micro_batch_size]) for i in range(6): all_data[i].append(tmp[i]) return all_data @dataclass class DataCollatorWithPadding: """ Data collator that will dynamically pad the inputs to the longest sequence in the batch. Args: tokenizer_type (str): The type of tokenizer used for encoding the data. 
""" def __init__(self, tokenizer_type, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors="pd", return_attention_mask=None): from ppfleetx.data.tokenizers import get_ernie_tokenizer self.tokenizer = get_ernie_tokenizer(tokenizer_type) self.padding = padding self.max_length = max_length self.pad_to_multiple_of = pad_to_multiple_of self.return_tensors = return_tensors self.return_attention_mask = return_attention_mask def __call__(self, features): batch = self.tokenizer.pad( features, padding=self.padding, max_length=self.max_length, pad_to_multiple_of=self.pad_to_multiple_of, return_tensors=self.return_tensors, return_attention_mask=self.return_attention_mask) if "label" in batch: batch["labels"] = batch["label"] del batch["label"] if "label_ids" in batch: batch["labels"] = batch["label_ids"] del batch["label_ids"] return batch def imagen_collate_fn(samples): """ collate for imagen base64 """ tmp = [] for i in samples: if i and len(i['image']): tmp.append(i) samples = tmp if len(samples) == 0: return None pad_idx = 0 text_items = [sample['caption'] for sample in samples] image_items = [sample['image'] for sample in samples] text_lengths = [len(cap) for cap in text_items] bsz = len(text_items) text_input = text_items image_input = paddle.stack(image_items, axis=0) _input = {'images': image_input, 'texts': text_input} return _input ================================================ FILE: ppfleetx/distributed/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/distributed/apis/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/distributed/apis/amp.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from collections import defaultdict from types import MethodType import numpy as np import paddle import paddle.nn as nn from paddle import _legacy_C_ops from paddle.fluid.dygraph import to_variable from paddle.fluid import framework from paddle.fluid.dygraph import base as imperative_base from paddle.framework import core from ppfleetx.distributed.apis import env class MixPrecisionLayer(nn.Layer): def __init__(self, layers, dtype="float16"): super().__init__(layers.full_name() + "_mix_precision") self._layers = layers self._dtype = dtype assert self._dtype in ["float16", "bfloat16"] for param in self._layers.parameters(): if not param.stop_gradient and not hasattr(param, "main_grad"): setattr(param, "main_grad", None) param._register_grad_hook(self._update_main_grad_hook(param)) def _update_main_grad_hook(self, param): """Create the update_main_grad hook for backprop.""" # Hook used for back-prop and grad-merge. @paddle.autograd.no_grad() def param_hook(tmp_grad): assert param.grad is None, \ "In main_grad node, param.grad should be None, but find param[{}] has grad.".format(param.name) if param.main_grad is None: param.main_grad = core.eager.Tensor( value=tmp_grad.cast(paddle.float32).value(), place=tmp_grad.place, name="main_grad@" + param.name, ) else: param.main_grad.add_(tmp_grad.cast(paddle.float32)) tmp_grad._clear_data() return None return param_hook def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) return outputs def state_dict( self, destination=None, include_sublayers=True, structured_name_prefix="", ): return self._layers.state_dict( destination=destination, include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix, ) @framework.deprecate_stat_dict def set_state_dict(self, state_dict, use_structured_name=True): self._layers.set_state_dict( state_dict, use_structured_name=use_structured_name) class MixPrecisionOptimizer: def __init__(self, optimizer): self._inner_opt = optimizer self._parameter_list = self._obtain_optimizer_parameters_list() def _obtain_optimizer_parameters_list(self): if getattr(self._inner_opt, '_param_groups', None) and isinstance( self._inner_opt._param_groups[0], dict): parameters_list = [] for group in self._inner_opt._param_groups: for param in group['params']: parameters_list.append(param) else: parameters_list = [ param for param in self._inner_opt._parameter_list ] return parameters_list @imperative_base.no_grad @framework.dygraph_only def step(self): if not isinstance(self._parameter_list[0], dict): params_grads = [] for param in self._parameter_list: if param.stop_gradient: continue grad_var = param.main_grad if framework.in_dygraph_mode(): if (hasattr(grad_var, "is_selected_rows") and grad_var.is_selected_rows() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." ) else: if (hasattr(grad_var, "_is_sparse") and grad_var._is_sparse() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." 
) params_grads.append((param, grad_var)) optimize_ops = self._inner_opt._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) else: # optimize parameters in groups for param_group in self._inner_opt._param_groups: params_grads = defaultdict(lambda: list()) for param in param_group['params']: if param.stop_gradient: continue grad_var = param.main_grad if framework.in_dygraph_mode(): if (hasattr(grad_var, "is_selected_rows") and grad_var.is_selected_rows() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." ) else: if (hasattr(grad_var, "_is_sparse") and grad_var._is_sparse() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." ) params_grads['params'].append((param, grad_var)) params_grads.update( {k: v for k, v in param_group.items() if k != 'params'}) self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) @framework.dygraph_only def clear_grad(self, set_to_zero=True): param_list = [] if self._parameter_list is None or not isinstance( self._parameter_list[0], dict): for p in self._parameter_list: if not p.stop_gradient: param_list.append(p) else: for param_group in self._param_groups: for p in param_group['params']: if not p.stop_gradient: param_list.append(p) for p in param_list: if hasattr(p, "main_grad") and p.main_grad is not None: if set_to_zero: p.main_grad.zero_() else: p.main_grad._clear() p.main_grad = None elif not hasattr(p, "main_grad"): p.clear_gradient(set_to_zero) def __getattr__(self, item): return getattr(self._inner_opt, item) def unscale_method(self, optimizer): if not self._enable: return param_grads = [] if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): for group in optimizer._param_groups: for param in group['params']: if param.main_grad is not None: assert param.main_grad.dtype == core.VarDesc.VarType.FP32 param_grads.append(param.main_grad) else: for param in optimizer._parameter_list: if param.main_grad is not None: assert param.main_grad.dtype == core.VarDesc.VarType.FP32 param_grads.append(param.main_grad) temp_found_inf = to_variable(np.array([0]).astype(np.bool_)) if len(param_grads): _legacy_C_ops.check_finite_and_unscale( param_grads, self._scale, param_grads, temp_found_inf, ) self._found_inf = 1 if temp_found_inf else 0 hcg = env.get_hcg() if hcg is not None and hcg.nranks > hcg.get_data_parallel_world_size(): is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") paddle.distributed.all_reduce( is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) self._found_inf = is_found_inf.numpy()[0] class MixPrecisionScaler: def __init__(self, scaler): self._inner_scaler = scaler self._inner_scaler._unscale = MethodType(unscale_method, scaler) def __getattr__(self, item): return getattr(self._inner_scaler, item) ================================================ FILE: ppfleetx/distributed/apis/comm_groups.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
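# Illustrative sketch (editorial addition, refers to amp.py above): a hedged
# example of wiring the three mix-precision wrappers together. MixPrecisionLayer
# installs float32 ``main_grad`` buffers through gradient hooks, MixPrecisionOptimizer
# steps on those main grads, and MixPrecisionScaler swaps in the main-grad-aware
# unscale method. The model and optimizer here are placeholders.
import paddle

model = paddle.nn.Linear(8, 8)
optimizer = paddle.optimizer.AdamW(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=32768.0)

model = MixPrecisionLayer(model, dtype="float16")
optimizer = MixPrecisionOptimizer(optimizer)
scaler = MixPrecisionScaler(scaler)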
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.distributed as dist from paddle.distributed import fleet from paddle.distributed.fleet.base.strategy_group import ( StrategyGroupBase, DPGroup, MPGroup, PPGroup, ShardingGroup, ) from paddle.distributed.fleet.base.orthogonal_strategy import OrthogonalStrategy def create_hcg(strategy, hcg_name): if hcg_name == "HybridCommunicateGroup": fleet.init(is_collective=True, strategy=strategy) hcg = fleet.get_hybrid_communicate_group() else: dist.init_parallel_env() hcg = eval("{}".format(hcg_name))(strategy) return hcg class MoEGroup(StrategyGroupBase): """ The communication group strategy for expert parallel. Args: list_of_ranks: A 2D-array, such as `[[0, 1, 2, 3], [4, 5, 6, 7]]`. Ranks in sublist represents they are in the same communication group. Returns: The instance of expert parallel strategy group. """ def __init__(self, list_of_ranks): super(MoEGroup, self).__init__(list_of_ranks) assert not isinstance( self.group, list), "Rank {} belongs to multi moe groups".format(self._rank) class Hybrid4DCommGroup(OrthogonalStrategy): def __init__(self, list_of_strategy=None, fused_strategy_dict={}): list_of_strategy = [ ("dp", 1, DPGroup), ("mp", 1, MPGroup), ("pp", 1, PPGroup), ("sharding", 1, ShardingGroup), ] if list_of_strategy is None else list_of_strategy fused_strategy_dict["check"] = ["mp", "pp"] super().__init__(list_of_strategy, fused_strategy_dict) # data parallel def get_data_parallel_rank(self): return self.rank_in_strategy("dp") def get_data_parallel_world_size(self): return self.strategy_group("dp").world_size def get_data_parallel_group(self): return self.strategy_group("dp").group def get_data_parallel_group_src_rank(self): return self.strategy_group("dp").group.ranks[0] # tensor parallel def get_model_parallel_rank(self): return self.rank_in_strategy("mp") def get_model_parallel_world_size(self): return self.strategy_group("mp").world_size def get_model_parallel_group(self): return self.strategy_group("mp").group def get_model_parallel_group_src_rank(self): return self.strategy_group("mp").group.ranks[0] # pipeline parallel def get_stage_id(self): return self.rank_in_strategy("pp") def get_pipe_parallel_world_size(self): return self.strategy_group("pp").world_size def get_pipe_parallel_group(self): return self.strategy_group("pp").group def get_p2p_groups(self): return (self.strategy_group("pp").p2p_groups) # group sharded parallel def get_sharding_parallel_rank(self): return self.rank_in_strategy("sharding") def get_sharding_parallel_world_size(self): return self.strategy_group("sharding").world_size def get_sharding_parallel_group(self): return self.strategy_group("sharding") def get_sharding_parallel_group_src_rank(self): return self.strategy_group("sharding").ranks[0] # check parallel group def get_check_parallel_group(self): return self.strategy_group("check").group class HybridCommGroupForMoE(Hybrid4DCommGroup): def __init__(self, strategy): self._dp_degree = strategy.hybrid_configs.get("dp_degree", 1) self._mp_degree = strategy.hybrid_configs.get("mp_degree", 1) self._pp_degree = strategy.hybrid_configs.get("pp_degree", 1) self._sharding_degree = 
strategy.hybrid_configs.get("sharding_degree", 1) assert self._pp_degree == 1, "The strategy combination of moe and pp \ has not been supported in ppfleetx right now." assert self._sharding_degree == 1, "The strategy combination of moe and sharding \ has not been supported in ppfleetx right now." list_of_strategy = [ ("dp", self._dp_degree, DPGroup), ("mp", self._mp_degree, MPGroup), ("pp", self._pp_degree, PPGroup), ("sharding", self._sharding_degree, ShardingGroup), ] fused_strategy_dict = {"moe": ["dp", "mp"]} super().__init__(list_of_strategy, fused_strategy_dict) def get_expert_parallel_world_size(self): return self.fused_strategy_group("moe").world_size def get_expert_parallel_group(self): return self.fused_strategy_group("moe").group ================================================ FILE: ppfleetx/distributed/apis/env.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import random import numpy as np import paddle import paddle.distributed as dist from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import comm_groups __all__ = ['init_dist_env'] _seed = None _dp_seed = None _hcg = None def set_seed(seed): # NOTE(shenliang03): For parameter init seed: # seed: dp/mp_undistributed_paramter/sharding is same; others is different # For compute seed(dropout): # global seed: only mp group is same. 
# local seed: all groups are different if dist.get_world_size() > 1: # obtain rank message of hybrid parallel hcg = get_hcg() mp_rank = hcg.get_model_parallel_rank() mp_size = hcg.get_model_parallel_world_size() pp_rank = hcg.get_stage_id() pp_size = hcg.get_pipe_parallel_world_size() dp_rank = hcg.get_data_parallel_rank() dp_size = hcg.get_data_parallel_world_size() sharding_rank = hcg.get_sharding_parallel_rank() sharding_size = hcg.get_sharding_parallel_world_size() else: mp_rank, mp_size = 0, 1 pp_rank, pp_size = 0, 1 dp_rank, dp_size = 0, 1 sharding_rank, sharding_size = 0, 1 # NOTE: the commented seeds are set only for precision validation # seed += 100 * pp_rank random.seed(seed + 100 * pp_rank) np.random.seed(seed + 100 * pp_rank) # seed = mp_rank + # pp_rank * (mp_size) + # dp_rank * (mp_size * pp_size) + # sharding_rank * (mp_size * pp_size * dp_size) # seed offset is order to avoid conflicts with the parameter initialization seed seed_offset = seed + 1024 + paddle.distributed.get_world_size() global_seed = seed_offset + \ pp_rank * (mp_size) + \ dp_rank * (mp_size * pp_size) + \ sharding_rank * (mp_size * pp_size * dp_size) seed_offset += paddle.distributed.get_world_size() local_seed = seed_offset + \ mp_rank + \ pp_rank * (mp_size) + \ dp_rank * (mp_size * pp_size) + \ sharding_rank * (mp_size * pp_size * dp_size) tracker = get_rng_state_tracker() tracker.add('global_seed', global_seed) tracker.add('local_seed', local_seed) paddle.seed(global_seed) logger.info("The global seed is set to {} and local seed is set to {}.". format(global_seed, local_seed)) global _seed global _dp_seed _seed = seed _dp_seed = global_seed def set_hcg(hcg): global _hcg _hcg = hcg def get_hcg(): global _hcg return _hcg def get_seed(): global _seed return _seed def get_dp_seed(): global _dp_seed return _dp_seed def init_dist_env(config): paddle.set_device(config.Global.device) strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { "dp_degree": config.Distributed.dp_degree, "mp_degree": config.Distributed.mp_degree, "pp_degree": config.Distributed.pp_degree, "sharding_degree": config.Distributed.sharding.sharding_degree, } if config.Distributed.pp_degree > 1: if 'sequence_parallel' in config.Model: if config.Model.sequence_parallel: assert config.Global.enable_partial_send_recv is False, \ "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv should be set False." 
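# (Editorial note, not in the original file) ``accumulate_steps`` below is the
# gradient-accumulation factor, i.e. local_batch_size // micro_batch_size; for
# example local_batch_size=16 with micro_batch_size=2 schedules 8 micro-batches
# per optimizer step on each pipeline. The numbers are illustrative.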
strategy.pipeline_configs = { "accumulate_steps": config.Global.local_batch_size // config.Global.micro_batch_size, "micro_batch_size": config.Global.micro_batch_size, "enable_partial_send_recv": config.Global.enable_partial_send_recv, } # set control in tensor parallel seed = config.Global.seed strategy.tensor_parallel_configs = {"tensor_init_seed": seed} hcg = comm_groups.create_hcg(strategy, hcg_name=config.Distributed.hcg) set_hcg(hcg) def get_local_rank(): return int(os.getenv("PADDLE_RANK_IN_NODE", 0)) def get_data_world_size(): if paddle.distributed.get_world_size() == 1: return 1 hcg = get_hcg() dp_size = hcg.get_data_parallel_world_size() sharding_size = hcg.get_sharding_parallel_world_size() return dp_size * sharding_size def get_data_world_rank(): if paddle.distributed.get_world_size() == 1: return 0 hcg = get_hcg() dp_rank = hcg.get_data_parallel_rank() sharding_rank = hcg.get_sharding_parallel_rank() sharding_size = hcg.get_sharding_parallel_world_size() return dp_rank * sharding_size + sharding_rank def work_at_local_rank0(func): def wrapper(*args, **kwargs): local_rank = 0 if paddle.fluid.core.is_compiled_with_dist( ) and paddle.distributed.get_world_size() > 1: local_rank = paddle.distributed.ParallelEnv().dev_id if local_rank == 0: func(*args, **kwargs) return wrapper ================================================ FILE: ppfleetx/distributed/apis/io.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import random import numpy as np import paddle import paddle.distributed as dist from paddle.distributed import fleet from paddle.incubate.distributed.utils.io import save_for_auto_inference from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env def save(output_dir, model, optimizer=None, step=0, epoch=0, sharding_stage=2): """ save the state dicts of model and optimizer into an checkpoint. 
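# Illustrative sketch (editorial addition): the checkpoint layout produced by
# ``save`` below for epoch 1 / step 1000 on a multi-rank job with mp_rank=0,
# sharding_rank=0, pp_rank=0 (paths are illustrative):
#
#   output_dir/epoch_1_step_1000/mp_00_sharding_00_pp_00/model.pdparams
#   output_dir/epoch_1_step_1000/mp_00_sharding_00_pp_00/model_state.pdopt
#   output_dir/epoch_1_step_1000/mp_00_sharding_00_pp_00/meta_state.pdopt
#
# ``load`` rebuilds the same per-rank sub-directory to locate these files.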
""" nranks = dist.get_world_size() if nranks > 1: hcg = env.get_hcg() dp_rank = hcg.get_data_parallel_rank() mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() sharding_rank = hcg.get_sharding_parallel_rank() else: dp_rank = 0 if dp_rank != 0: logger.info("DP_Rank %d doesn't save model" % dp_rank) return if output_dir and isinstance(output_dir, str): output_dir = os.path.join(output_dir, "epoch_%d_step_%d" % (epoch, step)) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) logger.info("Save model to %s" % output_dir) save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( output_dir, mp_rank, sharding_rank, pp_rank) if nranks > 1 else output_dir if sharding_stage == 3: model.get_all_parameters(convert2cpu=False) paddle.save(model.state_dict(), os.path.join(save_dir, "model.pdparams")) if optimizer is not None: paddle.save(optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt")) meta_dict = { "epoch": epoch, "step": step, "cuda_rng_state": paddle.get_cuda_rng_state() } paddle.save(meta_dict, os.path.join(save_dir, "meta_state.pdopt")) save_auto_dir = os.path.join(output_dir, "auto_infer") save_for_auto_inference(os.path.join(save_auto_dir, "auto"), model) else: raise TypeError("`save` requires a valid value of `output_dir`.") def load(ckpt_dir, model, optimizer=None, mode='train', load_recovery=None): nranks = dist.get_world_size() if nranks > 1: hcg = env.get_hcg() dp_rank = hcg.get_data_parallel_rank() mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() sharding_rank = hcg.get_sharding_parallel_rank() else: dp_rank = 0 load_recovery = {} if load_recovery is None else load_recovery if ckpt_dir and isinstance(ckpt_dir, str): logger.info("Try to load checkpoint from %s " % ckpt_dir) if mode == 'quant': load_dir = ckpt_dir else: load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( ckpt_dir, mp_rank, sharding_rank, pp_rank) if nranks > 1 else ckpt_dir model_path = os.path.join(load_dir, "model.pdparams") opt_path = os.path.join(load_dir, "model_state.pdopt") meta_path = os.path.join(load_dir, "meta_state.pdopt") if os.path.exists(model_path): model_dict = paddle.load(model_path) for name, param in model.state_dict().items(): assert name in model_dict.keys( ), "No param named `{}` was found in checkpoint file.".format( name) if param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) model.set_state_dict(model_dict) else: raise ValueError("No model checkpoint file found in %s." % model_path) if mode == 'train': if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) optimizer.set_state_dict(opt_dict) else: raise ValueError("No optimizer checkpoint file found in %s." % opt_path) if os.path.exists(meta_path): meta_dict = paddle.load(meta_path) load_recovery.update({ 'step': meta_dict['step'], 'epoch': meta_dict['epoch'], 'rng_state': meta_dict['cuda_rng_state'] }) else: raise ValueError("No meta checkpoint file found in %s." % meta_path) logger.info("successfully load checkpoints") else: logger.warning("`load` requires a valid value of `ckpt_dir`.") raise TypeError("`load` requires a valid value of `ckpt_dir`.") ================================================ FILE: ppfleetx/distributed/apis/strategy.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.distributed as dist import paddle.distributed.fleet as fleet from paddle.distributed.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.distributed.fleet.meta_parallel import TensorParallel from paddle.distributed.sharding import group_sharded_parallel from ppfleetx.distributed.apis import env, amp from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters def wrap_with_fleet(dist_config, model, optimizer=None, scaler=None): if dist_config.sharding.sharding_stage in [2, 3]: assert dist_config.pp_degree == 1, \ "sharding stage2/3 will support pipeline parallel later" return wrap_sharding_2_3(dist_config, model, optimizer, scaler) else: return wrap_3D_parallel(dist_config, model, optimizer, scaler) def wrap_sharding_2_3(dist_config, model, optimizer=None, scaler=None): hcg = env.get_hcg() dp_group = hcg.get_data_parallel_group() sharding_group = hcg.get_sharding_parallel_group() if dist_config.dp_degree > 1 and dist_config.sharding.sharding_stage == 3: sync_params_buffers( model, comm_group=dp_group, src_rank=dp_group.ranks[0]) if dist_config.mp_degree > 1: assert dist_config.sharding.sharding_stage == 2, "only support mp + sharding stage2 hybrid parallel now." model = TensorParallel(model, hcg, strategy=None) level = "p_g_os" if dist_config.sharding.sharding_stage == 3 else "os_g" origin_model = model model, optimizer, scaler = group_sharded_parallel( model=model, optimizer=optimizer, level=level, scaler=scaler, group=sharding_group, offload=dist_config.sharding.sharding_offload, dp_group=dp_group if dp_group.nranks > 1 else None) if dist_config.sharding.reduce_overlap: model._set_reduce_overlap(dist_config.sharding.reduce_overlap) if dist_config.sharding.broadcast_overlap: optimizer._set_broadcast_overlap( dist_config.sharding.broadcast_overlap, layers=origin_model, num_groups=2) return model, optimizer, scaler def wrap_3D_parallel(dist_config, model, optimizer=None, scaler=None): hcg = env.get_hcg() dp_group = hcg.get_data_parallel_group() if isinstance(model, amp.MixPrecisionLayer): if dist.get_world_size() == dist_config.dp_degree: sync_params_buffers( model, comm_group=dp_group, src_rank=dp_group.ranks[0]) elif dist_config.pp_degree > 1: model = fleet.distributed_model(model._layers) else: model = fleet.distributed_model(model) optimizer = fleet.distributed_optimizer( optimizer) if optimizer is not None else optimizer scaler = fleet.distributed_scaler(scaler) if scaler is not None else scaler return model, optimizer, scaler ================================================ FILE: ppfleetx/distributed/protein_folding/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
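# Illustrative sketch (editorial addition, refers to strategy.py above): hedged
# usage of ``wrap_with_fleet``. Depending on dist_config.sharding.sharding_stage,
# model/optimizer/scaler are wrapped either with group-sharded parallel
# (stage 2/3) or with fleet's 3D-parallel wrappers; ``dist_config`` stands for
# ``config.Distributed`` and is an assumption about the calling script.
model, optimizer, scaler = wrap_with_fleet(
    dist_config, model, optimizer=optimizer, scaler=scaler)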
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . scg import scg ================================================ FILE: ppfleetx/distributed/protein_folding/bp.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Branch Parallel helper function""" import paddle from paddle.autograd import PyLayer from . import scg __all__ = [ 'get_world_size', 'get_rank_in_group', ] def get_world_size(): nranks = 1 if hasattr(scg, "bp_group"): nranks = scg.bp_group.nranks return nranks def get_rank_in_group(): rank = 0 if hasattr(scg, "get_rank_in_bp_group"): rank = scg.get_rank_in_bp_group() return rank @paddle.no_grad() def broadcast(tensor, src): """ broadcast tensor from src rank in bp group """ if get_world_size() == 1: return tensor assert src in [0, 1], "Branch Parallel is only support bp_degree=2 now!" group = scg.bp_group task = group.process_group.broadcast(tensor, src) task.wait() return tensor class BroadcastGrad(PyLayer): """ A PyLayer Op broadcast gradient in backward stage """ @staticmethod def forward(ctx, input, src): """ return input directly """ ctx.src = src return input.clone() @staticmethod def backward(ctx, grad_output): """ broadcast grad form src """ broadcast(grad_output, ctx.src) return grad_output.clone() def broadcast_grad_for_backward(input, src): """ a warpper for boradcast gradient in backward stage """ if get_world_size() == 1: return input if not input.stop_gradient: output = BroadcastGrad.apply(input, src) else: output = input.clone() return output @paddle.no_grad() def all_reduce(tensor): """ allreduce a tensor in bp group """ if get_world_size() == 1: return tensor group = scg.bp_group paddle.distributed.all_reduce( tensor, sync_op=True, group=group) return tensor class SyncEvoformerResults(PyLayer): """ A PyLayer Op broadcast gradient in backward stage """ @staticmethod def forward(ctx, outer, msa, pair): broadcast(outer, 0) if get_rank_in_group() == 1: pair += outer broadcast(pair, 1) broadcast(msa, 0) return msa, pair @staticmethod def backward(ctx, *grad_output): msa_grad = grad_output[0] pair_grad = grad_output[1] if get_rank_in_group() == 0: pair_grad = paddle.zeros_like(pair_grad) outer_grad = pair_grad.clone() broadcast(outer_grad, 1) return outer_grad, msa_grad, pair_grad def sync_evoformer_results(outer, msa, pair): """ a warpper for boradcast gradient in backward stage """ if get_world_size() == 1: return msa, pair if outer.stop_gradient and msa.stop_gradient and pair.stop_gradient: return msa, pair msa, pair = SyncEvoformerResults.apply(outer, msa, pair) return msa, pair @paddle.no_grad() def 
grad_sync(param_groups): """ sync the gradients of params """ nranks = get_world_size() if nranks < 2: return comm_group = scg.bp_group for group in param_groups: if group.get("bp", False): for p in group['params']: if p.is_distributed: continue grad = p.grad if grad is None: continue paddle.distributed.all_reduce( grad, sync_op=True, group=comm_group) return None ================================================ FILE: ppfleetx/distributed/protein_folding/dap.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Dynamic Axial Parallelism and Duality Async Operation helper functions paper ref: FastFold: Reducing AlphaFold Training Time from 11 Days to 67 Hours, https://arxiv.org/abs/2203.00854 code ref: https://github.com/hpcaitech/FastFold.git """ import warnings import time import paddle from paddle import nn from paddle import distributed as dist from paddle.autograd import PyLayer from . import scg __all__ = [ 'set_dap_sync_op', 'get_dap_sync_op', 'get_world_size', 'get_rank_in_group', 'scatter', 'gather', 'all_gather', 'all_gather_opp', 'all_to_all', 'all_to_all_opp', 'row_to_col', 'col_to_row' ] _sync_op = True def set_dap_sync_op(sync_op): assert sync_op in [True, False] assert sync_op is True, "Only support sync mode now!" 
global _sync_op _sync_op = sync_op def get_dap_sync_op(): global _sync_op return _sync_op def get_world_size(): nranks = 1 if hasattr(scg, "dap_group"): nranks = scg.dap_group.nranks return nranks def get_rank_in_group(): rank = 0 if hasattr(scg, "get_rank_in_dap_group"): rank = scg.get_rank_in_dap_group() return rank def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, '{} is not divisible by {}'.format( numerator, denominator) def divide(numerator, denominator): ensure_divisibility(numerator, denominator) return numerator // denominator @paddle.no_grad() def _all_gather(tensor, axis=-1, sync_op=True): group = scg.dap_group tensor_shape = list(tensor.shape) tensor_shape[0] *= group.nranks out = paddle.zeros(tensor_shape, tensor.dtype) out.stop_gradient = tensor.stop_gradient task = group.process_group.all_gather(tensor, out) task.wait() return out @paddle.no_grad() def _gather(tensor, axis=-1): output = _all_gather(tensor) if axis != 0: output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=axis) return output @paddle.no_grad() def _split(tensor, axis=-1): ensure_divisibility(tensor.shape[axis], get_world_size()) tensor_list = paddle.split(tensor, get_world_size(), axis=axis) output = tensor_list[get_rank_in_group()] return output class Scatter(PyLayer): """ Scatter PyLayer Op""" @staticmethod def forward(ctx, input, axis: -1): ctx.axis = axis return _split(input, axis=axis) @staticmethod def backward(ctx, grad_output): return _gather(grad_output, axis=ctx.axis) def scatter(input, axis=-1): """ split a tensor according axis by dap size """ if get_world_size() == 1: return input if not input.stop_gradient: output = Scatter.apply(input, axis=axis) else: output = _split(input, axis=axis) return output class Gather(PyLayer): """ Gather PyLayer Op """ @staticmethod def forward(ctx, input, axis=-1): ctx.axis = axis return _gather(input, axis=axis) @staticmethod def backward(ctx, grad_output): return _split(grad_output, axis=ctx.axis) def gather(input, axis=-1): """ gather tensor form all rank in dap group in axis """ if get_world_size() == 1: return input if not input.stop_gradient: output = Gather.apply(input, axis=axis) else: output = _gather(input, axis=axis) return output @paddle.no_grad() def _reduce_scatter(tensor, sync_op=True): group = scg.dap_group tensor_shape = list(tensor.shape) tensor_shape[0] = divide(tensor_shape[0], group.nranks) output = paddle.zeros(tensor_shape, tensor.dtype) output.stop_gradient = tensor.stop_gradient dist.stream.reduce_scatter( output, tensor, op=dist.ReduceOp.SUM, group=group, sync_op=True) return output class AllGather(PyLayer): """ AllGather PyLayer Op """ @staticmethod def forward(ctx, input, axis=-1, sync_op=True): ctx.axis = axis ctx.sync_op = sync_op output = _all_gather(input, axis=axis, sync_op=sync_op) return output @staticmethod def backward(ctx, grad_output): if not ctx.sync_op: pass # TODO(GuoxiaWang): implement wait logical return grad_output class AllGather_Opp(PyLayer): """ Duality Async Operation for AllGather """ @staticmethod def forward(ctx, input, axis=-1, sync_op=True): ctx.axis = axis ctx.sync_op = sync_op return input @staticmethod def backward(ctx, grad_output): output = _reduce_scatter(grad_output, sync_op=ctx.sync_op) return output def all_gather(input, axis=-1): """ gather tensors from all rank in dap group and all get the result. if sync_op=None, sync will be assign according init_dap setting. 
when using async communication, sync_op=False, do not use the output as same as input. E.g. do not use `a = all_gather(a, ...)`, recommend to use `b = all_gather(a, ...)` """ if get_world_size() == 1: return input sync_op = get_dap_sync_op() if not input.stop_gradient: output = AllGather.apply(input, axis, sync_op=sync_op) else: output = _all_gather(input, axis, sync_op=sync_op) return output def all_gather_opp(output, axis=-1): """ Duality Async Operation for all_gather. if sync_op=None, sync will be assign according init_dap setting. """ nranks = get_world_size() if nranks == 1: return output sync_op = get_dap_sync_op() if not sync_op: # TODO(GuoxiaWang): implement wait logical pass if not output.stop_gradient: output = AllGather_Opp.apply(output, axis, sync_op=sync_op) if axis != 0: output = paddle.concat(paddle.split(output, nranks, 0), axis=axis) return output @paddle.no_grad() def _all_to_all(tensor, in_axis=-1, out_axis=-1, sync_op=True): group = scg.dap_group tensor_shape = list(tensor.shape) out = paddle.zeros(tensor_shape, tensor.dtype) out.stop_gradient = tensor.stop_gradient task = group.process_group.alltoall(tensor, out) task.wait() return out class All_to_All(PyLayer): """ All_to_All PyLayer Op""" @staticmethod def forward(ctx, input, in_axis=-1, out_axis=-1, sync_op=True): ctx.in_axis = in_axis ctx.out_axis = out_axis ctx.sync_op = sync_op return _all_to_all( input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) @staticmethod def backward(ctx, grad_output): if not ctx.sync_op: # TODO(GuoxiaWang): implement wait logical pass return grad_output class All_to_All_Opp(PyLayer): """ Duality Async Operation for All_to_All """ @staticmethod def forward(ctx, output, in_axis=-1, out_axis=-1, sync_op=True): ctx.in_axis = in_axis ctx.out_axis = out_axis ctx.sync_op = sync_op return output @staticmethod def backward(ctx, grad_output): return _all_to_all( grad_output, in_axis=ctx.out_axis, out_axis=ctx.in_axis, sync_op=ctx.sync_op) def all_to_all(input, in_axis, out_axis): """ all to all according in_axis and out_axis. if sync_op=None, sync will be assign according init_dap setting. """ if get_world_size() == 1: return input sync_op = get_dap_sync_op() if in_axis != 0: ensure_divisibility(input.shape[in_axis], get_world_size()) input = paddle.concat( paddle.split( input, get_world_size(), axis=in_axis), axis=0) if not input.stop_gradient: output = All_to_All.apply( input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) else: output = _all_to_all( input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) return output def all_to_all_opp(output, in_axis, out_axis): """ Duality Async Operation for all_to_all. if sync_op=None, sync will be assign according init_dap setting. 
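# Illustrative sketch (editorial addition, refers to dap.py above): with a
# dap_group of size N, ``scatter`` keeps 1/N of a tensor along ``axis`` on each
# rank and ``gather`` is its inverse (their PyLayer variants swap roles in
# backward). A hedged round trip assuming a 2-rank dap_group is initialized:
import paddle

x = paddle.randn([4, 16, 64])   # e.g. (batch, seq, channels)
shard = scatter(x, axis=1)      # (4, 8, 64) on each of the 2 ranks
full = gather(shard, axis=1)    # (4, 16, 64) again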
""" if get_world_size() == 1: return output sync_op = get_dap_sync_op() if not sync_op: # TODO(GuoxiaWang): implement wait logical pass if not output.stop_gradient: output = All_to_All_Opp.apply( output, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) if out_axis != 0: ensure_divisibility(output.shape[0], get_world_size()) output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=out_axis) return output class All2All(PyLayer): @staticmethod def forward(ctx, input, in_axis=-1, out_axis=-1): ctx.in_axis = in_axis ctx.out_axis = out_axis return _all_to_all(input, in_axis=in_axis, out_axis=out_axis) @staticmethod def backward(ctx, grad_output): return _all_to_all( grad_output, in_axis=ctx.out_axis, out_axis=ctx.in_axis) def row_to_col(input): """ N, S, R, C => N, R, S, C using sync all_to_all """ if get_world_size() == 1: return input ensure_divisibility(input.shape[2], get_world_size()) input = paddle.concat( paddle.split( input, get_world_size(), axis=2), axis=0) if not input.stop_gradient: output = All2All.apply(input, in_axis=2, out_axis=1) else: output = _all_to_all(input, in_axis=2, out_axis=1) output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=1) return output def col_to_row(input): """ N, R, S, C => N, S, R, C using sync all_to_all """ if get_world_size() == 1: return input ensure_divisibility(input.shape[1], get_world_size()) input = paddle.concat( paddle.split( input, get_world_size(), axis=1), axis=0) if not input.stop_gradient: output = All2All.apply(input, in_axis=1, out_axis=2) else: output = _all_to_all(input, in_axis=1, out_axis=2) output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=2) return output @paddle.no_grad() def grad_sync(param_groups): """ sync the gradients of params """ nranks = get_world_size() if nranks < 2: return comm_group = scg.dap_group for group in param_groups: if group.get("dap", False): for p in group['params']: if p.is_distributed: continue grad = p.grad if grad is None: continue paddle.distributed.all_reduce( grad, sync_op=True, group=comm_group) return None ================================================ FILE: ppfleetx/distributed/protein_folding/dp.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Distributed Data Parallel helper functions """ import paddle from . 
import scg __all__ = [ 'get_world_size', 'get_rank_in_group', 'grad_sync', 'param_sync' ] def get_world_size(): nranks = 1 if hasattr(scg, "dp_group"): nranks = scg.dp_group.nranks return nranks def get_rank_in_group(): rank = 0 if hasattr(scg, "get_rank_in_dp_group"): rank = scg.get_rank_in_dp_group() return rank @paddle.no_grad() def grad_sync(param_groups, grad_avg=True): """ sync the gradients of params """ nranks = get_world_size() if nranks < 2: return comm_group = scg.dp_group for group in param_groups: for p in group['params']: if p.is_distributed: continue grad = p.grad if grad is None: continue paddle.distributed.all_reduce( grad, sync_op=True, group=comm_group) if grad_avg: grad = p.grad.scale_(1.0 / nranks) return None @paddle.no_grad() def param_sync(model, src_rank=0, comm_group=None): """ broadcast params to other ranks """ nranks = paddle.distributed.get_world_size( ) if comm_group is None else comm_group.nranks if nranks < 2: return for _, param in model._obtain_parameters_buffers().items(): if param.is_distributed: continue if getattr(param, "no_sync", False): continue paddle.distributed.broadcast( param, src=src_rank, group=comm_group, sync_op=True) return None @paddle.no_grad() def all_reduce(tensor, op=paddle.distributed.ReduceOp.SUM): """ allreduce a tensor in bp group """ if get_world_size() == 1: return tensor group = scg.dp_group paddle.distributed.all_reduce( tensor, sync_op=True, op=op, group=group) return tensor ================================================ FILE: ppfleetx/distributed/protein_folding/scg.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Communication group manager """ import types import numpy as np from paddle import distributed as dist def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, '{} is not divisible by {}'.format( numerator, denominator) class SingletonCommunicationGroup(object): """ A singleton communication group for hybrid parallel. """ def __init__(self): self.initialized = False def init_process_group(self, parallel_degree=[('dp', None)], custom_parallel_degree=None): """ init the hybrid parallel process group. In most cases, only one hybrid parallel process group is initialized in a distributed program, so this is a singleton design. args: parallel_degree(list of tuple): Each parallel strategy consists of a tuple. E.g. [('dp', None), ('pp', 2), ('mp', 2)], means that the data parallel degree is obtained by calculation, the pipeline parallel degree is 2, and the model parallel degree is 2. For data parallelism, it is special. It is assumed that data parallelism has always been in the outermost dimension. If it is not set, the data parallelism degree will be automatically calculated. When multiple distributed strategies fully overlap, this can be represented by setting multiple parallel names in a tuple. 
For example, [('dp', None), ('mp', 'bp', 2)]. Default is [('dp', None)] custom_parallel_degree(list of tuple): Higher-order usages can be used when the automatically derived parallel strategy fails to meet user needs. The user can calculate the rank id in the communication group and pass it in through the `custom_parallel_degree` arg. Default is None. E.g. [('dp', [[0, 2, 4, 6], [1, 3, 5, 7]]), ('mp', 'bp', [[0, 1], [2, 3], [4, 5], [6, 7]])] note: `parallel_degree` and `custom_parallel_degree` are mutually exclusive, only one can be set at the same time. example 1: # 8 gpus on single node, dp will be 2 # dp_group_ranks = [[0, 4], [1, 5], [2, 6], [3, 7]] # pp_group_ranks = [[0, 2], [1, 3], [4, 6], [5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('dp', None), ('pp', 2), ('mp', 2)]) print(scg.dp_group) print(scg.get_rank_in_bp_group()) print(scg.get_dp_world_size()) example 2: # 8 gpus on single node, dp will be 2 # dp_group_ranks = [[0, 4], [1, 5], [2, 6], [3, 7]] # pp_group_ranks = [[0, 2], [1, 3], [4, 6], [5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('pp', 2), ('mp', 2)]) example 3: # 8 gpus on single node, dp will be 4, mp and bp share a communication group. # dp_group_ranks = [[0, 2, 4, 6], [1, 3, 5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] # bp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('dp', None), ('mp', 'bp', 2)]) example 4: # 8 gpus on single node, dp will be 8, mp will be 8, dp and mp share a communication group. # dp_group_ranks = [[0, 1, 2, 3, 4, 5, 6, 7]] # mp_group_ranks = [[0, 1, 2, 3, 4, 5, 6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('dp', 'mp', 8)]) example 5: # Equal to example 3 but pass config by custom_parallel_degree. # dp_group_ranks = [[0, 2, 4, 6], [1, 3, 5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] # bp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=None, custom_parallel_degree=[('dp', [[0, 2, 4, 6], [1, 3, 5, 7]]), ('mp', 'bp', [[0, 1], [2, 3], [4, 5], [6, 7]])]) """ assert not (parallel_degree is not None and custom_parallel_degree is not None), \ f"parallel_degree and custom_parallel_degree only can be set one." assert self.initialized == False, "Communication group is already initialized!" if dist.is_initialized() is not None: dist.init_parallel_env() world_size = dist.get_world_size() rank = dist.get_rank() # parse parallel_degree if parallel_degree is not None and custom_parallel_degree is None: def check_valid(inp): assert isinstance( inp, list), f"parallel_degree must be list of tuple" for item in inp: num_ele = len(item) assert num_ele >= 2, f"each item in parallel_degree must has least two element." assert isinstance(item[-1], ( int, type(None) )), f"the last element in each item must be int or None" for idx in range(num_ele - 1): assert isinstance(item[idx], str) check_valid(parallel_degree) dp_exist = False dp_has_set = False num_ranks = 1 for idx, item in enumerate(parallel_degree): degree = item[-1] if 'dp' in item: assert idx == 0, 'The data parallel dimension must be the outermost dimension.' dp_exist = True if degree is not None: dp_has_set = True else: degree = 1 assert degree is not None, 'All but dp must specify the parallel degree explicitly.' 
num_ranks *= degree # check and update dp if not dp_exist: assert world_size % num_ranks == 0, 'The total number of parallelism products set is not divisible by the total number of cards.' parallel_degree.insert(0, ('dp', world_size // num_ranks)) elif dp_exist and not dp_has_set: assert world_size % num_ranks == 0, 'The total number of parallelism products set is not divisible by the total number of cards.' parallel_degree[0] = ('dp', world_size // num_ranks) else: assert num_ranks == world_size, 'The total number of parallelism products set is not equal to the total number of cards.' degrees = tuple([item[-1] for item in parallel_degree]) num_parallel = len(parallel_degree) group_arr = np.arange(0, world_size).reshape(degrees) custom_parallel_degree = [] for idx, item in enumerate(parallel_degree): parallel_name = item[0] degree = item[-1] transpose_axes = [] for axis in range(num_parallel): if axis != idx: transpose_axes.append(axis) transpose_axes.append(idx) arr = group_arr.transpose(transpose_axes).reshape((-1, degree)) custom_parallel_degree.append([]) for parallel_name in item[:-1]: custom_parallel_degree[idx].append(parallel_name) custom_parallel_degree[idx].append([]) for i in range(world_size // degree): ranks = arr[i].tolist() custom_parallel_degree[idx][-1].append(ranks) custom_parallel_degree[idx] = tuple(custom_parallel_degree[ idx]) else: print( "We do not check the validity of user-defined custom_parallel_degree." ) # new group and set attr for item in custom_parallel_degree: ranks_list = item[-1] for i in range(len(ranks_list)): ranks = ranks_list[i] for parallel_name in item[:-1]: group = dist.new_group(ranks) print(f'> {parallel_name} ranks: {ranks}') if rank in ranks: setattr(self, f'{parallel_name}_group', group) def get_rank_in_group(parallel_name): def func(): if not self.initialized: return -1 group = getattr(self, f'{parallel_name}_group') return group.get_group_rank(dist.get_rank()) return func setattr(self, f'get_rank_in_{parallel_name}_group', get_rank_in_group(parallel_name)) def get_group_world_size(parallel_name): def func(): if not self.initialized: return -1 group = getattr(self, f'{parallel_name}_group') return group.nranks return func setattr(self, f'get_{parallel_name}_world_size', get_group_world_size(parallel_name)) self.initialized = True scg = SingletonCommunicationGroup() ================================================ FILE: ppfleetx/models/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import sys import copy from ppfleetx.core.module.basic_module import BasicModule from ppfleetx.models.language_model.language_module import GPTModule, GPTGenerationModule, GPTEvalModule, GPTFinetuneModule from ppfleetx.models.language_model.gpt.auto.auto_module import GPTModuleAuto, GPTGenerationModuleAuto from ppfleetx.models.vision_model.general_classification_module import GeneralClsModule, GeneralClsModuleAuto from ppfleetx.models.vision_model.moco_module import MOCOModule, MOCOClsModule from ppfleetx.models.multimodal_model.multimodal_module import ImagenModule from ppfleetx.models.language_model.ernie import ErnieModule, ErnieSeqClsModule, ErnieModuleAuto, ErnieSeqClsModuleAuto from ppfleetx.models.language_model.language_module import MoEModule from ppfleetx.models.multimodal_model.multimodal_module import ImagenModule def build_module(config): module_name = config.Model.get("module", "BasicModule") module = eval(module_name)(config) return module ================================================ FILE: ppfleetx/models/language_model/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/auto_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
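The `build_module` factory defined in `ppfleetx/models/__init__.py` above resolves the module class purely by name: it reads `config.Model.module` and evaluates that string against the classes imported in the same file. A minimal, self-contained sketch of that lookup follows; the `_AttrDict` stand-in and the `"GPTModule"` value are illustrative only and are not the real config loader.

```python
# Sketch of build_module()'s name lookup; the real config object comes from
# ppfleetx's YAML loader, so this tiny attribute-dict stand-in is hypothetical.
class _AttrDict(dict):
    __getattr__ = dict.__getitem__

config = _AttrDict(Model=_AttrDict(module="GPTModule"))
module_name = config.Model.get("module", "BasicModule")
print(module_name)  # "GPTModule" -- build_module() then calls eval(module_name)(config)
```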
import os import sys import numpy as np import paddle.distributed as dist import paddle.distributed.auto_parallel as auto from functools import reduce def process_mesh_config(config): class Mesh: def __init__(self, config): self.dp_dim = None self.mp_dim = None self.process_mesh = None self.config = config topology = list( filter(lambda x: x > 1, [ self.config['pp_degree'], self.config['dp_degree'], self.config['mp_degree'] ])) num_proc = 1 if not topology else reduce(lambda x, y: x * y, topology) processes = [i for i in range(num_proc)] if self.config['pp_degree'] > 1: if len(topology) > 1: # dpmppp, dppp, mppp if len(topology) > 2: # dpmppp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['pp', 'dp', 'mp']) self.dp_dim = 'dp' self.mp_dim = 'mp' elif self.config['dp_degree'] > 1: # dppp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['pp', 'dp']) self.dp_dim = 'dp' elif self.config['mp_degree'] > 1: # mppp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['pp', 'mp']) self.mp_dim = 'mp' elif len(topology) == 1: # pp self.process_mesh = auto.ProcessMesh( processes, dim_names=['pp']) else: if len(topology) > 1: # dpmp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['dp', 'mp']) self.dp_dim = 'dp' self.mp_dim = 'mp' elif self.config['dp_degree'] > 1: # dp self.process_mesh = auto.ProcessMesh( processes, dim_names=['dp']) self.dp_dim = 'dp' elif self.config['mp_degree'] > 1: # mp self.process_mesh = auto.ProcessMesh( processes, dim_names=['mp']) self.mp_dim = 'mp' else: # serial self.process_mesh = auto.ProcessMesh(processes) def __getitem__(self, idx): if 'pp' in self.process_mesh.dim_names: return self.process_mesh[idx] return self.process_mesh def stages(self, num_layers): layer_per_stage = num_layers // self.config['pp_degree'] return [i // layer_per_stage for i in range(num_layers)] @property def dp(self): return self.dp_dim @property def mp(self): return self.mp_dim return Mesh(config) def process_model_configs(config): """ process model configs for auto parallel """ cfg_model = config['Model'] mesh = process_mesh_config(config['Distributed']) cfg_model.update({'mesh': mesh}) if cfg_model['ffn_hidden_size'] is None: cfg_model['ffn_hidden_size'] = 4 * cfg_model['hidden_size'] if cfg_model['use_recompute']: if not cfg_model.get('recompute_granularity', None): cfg_model['recompute_granularity'] = 'full' def process_data_configs(config): """ process data configs for auto parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] def process_configs(config): process_model_configs(config) process_data_configs(config) return config ================================================ FILE: ppfleetx/models/language_model/debertav2/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .modeling import (get_debertav2_model, debertav2_encode_text, get_debertav2_encoded_dim) from ppfleetx.models.language_model.t5 import normal_, constant_init ================================================ FILE: ppfleetx/models/language_model/debertav2/modeling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Paddle DeBERTa-v2 model.""" from collections.abc import Sequence from typing import Optional, Tuple, Union import json import paddle from paddle import nn from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss from ppfleetx.models.language_model.t5 import (finfo, ACT2FN, ModelOutput, normal_, constant_init) from ppfleetx.data.tokenizers.debertav2_tokenizer import debertav2_tokenize from dataclasses import dataclass class BaseModelOutput(ModelOutput): """ Base class for model's outputs, with potential hidden states and attentions. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ last_hidden_state = None hidden_states = None attentions = None # Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 class XSoftmax(paddle.autograd.PyLayer): """ Masked Softmax which is optimized for saving memory Args: input (`paddle.tensor`): The input tensor that will apply softmax. mask (`paddle.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation. 
dim (int): The dimension that will apply softmax Example: ```python >>> import paddle >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax >>> # Make a tensor >>> x = paddle.randn([4, 20, 100]) >>> # Create a mask >>> mask = (x > 0).int() >>> # Specify the dimension to apply softmax >>> dim = -1 >>> y = XSoftmax.apply(x, mask, dim) ```""" @staticmethod def forward(self, input, mask, dim): self.dim = dim #rmask = ~(mask.cast('bool')) #output = input.masked_fill(rmask, paddle.to_tensor(finfo(input.dtype).min)) mask = mask.cast('bool') output = paddle.where(mask == 0, paddle.to_tensor(finfo(input.dtype).min), input) output = paddle.nn.functional.softmax( output, axis=self.dim, dtype=paddle.float32) output = paddle.where(mask == 0, paddle.to_tensor(0.), output) return output # Copied from transformers.models.deberta.modeling_deberta.DropoutContext class DropoutContext(object): def __init__(self): self.dropout = 0 self.mask = None self.scale = 1 self.reuse_mask = True # Copied from transformers.models.deberta.modeling_deberta.get_mask def get_mask(input, local_context): if not isinstance(local_context, DropoutContext): dropout = local_context mask = None else: dropout = local_context.dropout dropout *= local_context.scale mask = local_context.mask if local_context.reuse_mask else None if dropout > 0 and mask is None: mask = (1 - paddle.bernoulli( paddle.full( shape=input.shape, fill_value=1 - dropout))).cast(bool) if isinstance(local_context, DropoutContext): if local_context.mask is None: local_context.mask = mask return mask, dropout # Copied from transformers.models.deberta.modeling_deberta.XDropout class XDropout(paddle.autograd.PyLayer): """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" @staticmethod def forward(ctx, input, local_ctx): mask, dropout = get_mask(input, local_ctx) ctx.scale = 1.0 / (1 - dropout) if dropout > 0: output = paddle.where(mask == 1, 0, input) return output * ctx.scale else: return input # Copied from transformers.models.deberta.modeling_deberta.StableDropout class StableDropout(nn.Layer): """ Optimized dropout module for stabilizing the training Args: drop_prob (float): the dropout probabilities """ def __init__(self, drop_prob): super().__init__() self.drop_prob = drop_prob self.count = 0 self.context_stack = None def forward(self, x): """ Call the module Args: x (`paddle.to_tensor`): The input tensor to apply dropout """ if self.training and self.drop_prob > 0: return XDropout.apply(x, self.get_context()) return x def clear_context(self): self.count = 0 self.context_stack = None def init_context(self, reuse_mask=True, scale=1): if self.context_stack is None: self.context_stack = [] self.count = 0 for c in self.context_stack: c.reuse_mask = reuse_mask c.scale = scale def get_context(self): if self.context_stack is not None: if self.count >= len(self.context_stack): self.context_stack.append(DropoutContext()) ctx = self.context_stack[self.count] ctx.dropout = self.drop_prob self.count += 1 return ctx else: return self.drop_prob # Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm class DebertaV2SelfOutput(nn.Layer): def __init__(self, hidden_size=1536, layer_norm_eps=1e-7, hidden_dropout_prob=0.1): super().__init__() self.dense = nn.Linear(hidden_size, hidden_size) self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) def forward(self, hidden_states, input_tensor): 
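        # Project the attention output, apply StableDropout, then residual-add the
        # block input and apply post-LayerNorm -- the same output pattern as BERT's
        # self-output block.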
hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states # Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 class DebertaV2Attention(nn.Layer): def __init__( self, hidden_size=512, num_attention_heads=24, attention_head_size=64, share_att_key=True, pos_att_type=None, relative_attention=True, position_buckets=-1, max_relative_positions=-1, max_position_embeddings=512, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, ): super().__init__() self.self = DisentangledSelfAttention( hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, share_att_key=share_att_key, pos_att_type=pos_att_type, relative_attention=relative_attention, position_buckets=position_buckets, max_relative_positions=max_relative_positions, max_position_embeddings=max_position_embeddings, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob, ) self.output = DebertaV2SelfOutput( hidden_size=hidden_size, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob) def forward( self, hidden_states, attention_mask, output_attentions=False, query_states=None, relative_pos=None, rel_embeddings=None, ): self_output = self.self( hidden_states, attention_mask, output_attentions, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, ) if output_attentions: self_output, att_matrix = self_output if query_states is None: query_states = hidden_states attention_output = self.output(self_output, query_states) if output_attentions: return (attention_output, att_matrix) else: return attention_output # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 class DebertaV2Intermediate(nn.Layer): def __init__( self, hidden_size=1536, hidden_act='gelu', intermediate_size=6144, ): super().__init__() self.dense = nn.Linear(hidden_size, intermediate_size) if isinstance(hidden_act, str): self.intermediate_act_fn = ACT2FN[hidden_act] else: self.intermediate_act_fn = hidden_act def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states # Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm class DebertaV2Output(nn.Layer): def __init__( self, hidden_size=512, intermediate_size=6144, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, ): super().__init__() self.dense = nn.Linear(intermediate_size, hidden_size) self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states # Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 class DebertaV2Layer(nn.Layer): def __init__( self, hidden_size=512, hidden_act='gelu', intermediate_size=6144, num_attention_heads=24, attention_head_size=64, share_att_key=True, pos_att_type=None, relative_attention=True, position_buckets=256, max_relative_positions=-1, max_position_embeddings=512, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, ): super().__init__() self.attention 
= DebertaV2Attention( hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, share_att_key=share_att_key, pos_att_type=pos_att_type, relative_attention=relative_attention, position_buckets=position_buckets, max_relative_positions=max_relative_positions, max_position_embeddings=max_position_embeddings, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob, ) self.intermediate = DebertaV2Intermediate( hidden_size=hidden_size, hidden_act=hidden_act, intermediate_size=intermediate_size, ) self.output = DebertaV2Output( hidden_size=hidden_size, intermediate_size=intermediate_size, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, ) def forward( self, hidden_states, attention_mask, query_states=None, relative_pos=None, rel_embeddings=None, output_attentions=False, ): attention_output = self.attention( hidden_states, attention_mask, output_attentions=output_attentions, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, ) if output_attentions: attention_output, att_matrix = attention_output intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) if output_attentions: return (layer_output, att_matrix) else: return layer_output class ConvLayer(nn.Layer): def __init__( self, hidden_size=512, conv_kernel_size=3, conv_groups=1, conv_act="tanh", layer_norm_eps=1e-7, hidden_dropout_prob=0., ): super().__init__() kernel_size = conv_kernel_size groups = conv_groups self.conv_act = conv_act self.conv = nn.Conv1D( hidden_size, hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups) self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) def forward(self, hidden_states, residual_states, input_mask): out = self.conv(hidden_states.transpose([0, 2, 1])).transpose( [0, 2, 1]) out = paddle.where( input_mask.cast('bool').unsqueeze(-1).expand(out.shape) == 0, paddle.to_tensor(0.), out) out = ACT2FN[self.conv_act](self.dropout(out)) layer_norm_input = residual_states + out output = self.LayerNorm(layer_norm_input).cast(layer_norm_input.dtype) if input_mask is None: output_states = output else: if input_mask.dim() != layer_norm_input.dim(): if input_mask.dim() == 4: input_mask = input_mask.squeeze(1).squeeze(1) input_mask = input_mask.unsqueeze(2) input_mask = input_mask.cast(output.dtype) output_states = output * input_mask return output_states class DebertaV2Encoder(nn.Layer): """Modified BertEncoder with relative position bias support""" def __init__( self, num_hidden_layers=48, num_attention_heads=24, attention_head_size=64, relative_attention=False, max_relative_positions=-1, max_position_embeddings=512, position_buckets=256, hidden_size=1536, hidden_act='gelu', conv_act='gelu', intermediate_size=6144, share_att_key=True, pos_att_type=None, norm_rel_ebd=None, conv_kernel_size=0, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, ): super().__init__() self.layer = nn.LayerList([ DebertaV2Layer( hidden_size=hidden_size, hidden_act=hidden_act, intermediate_size=intermediate_size, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, share_att_key=share_att_key, pos_att_type=pos_att_type, relative_attention=relative_attention, position_buckets=position_buckets, max_relative_positions=max_relative_positions, 
max_position_embeddings=max_position_embeddings, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob) for _ in range(num_hidden_layers) ]) self.relative_attention = relative_attention if self.relative_attention: self.max_relative_positions = max_relative_positions if self.max_relative_positions < 1: self.max_relative_positions = max_position_embeddings self.position_buckets = position_buckets pos_ebd_size = self.max_relative_positions * 2 if self.position_buckets > 0: pos_ebd_size = self.position_buckets * 2 self.rel_embeddings = nn.Embedding(pos_ebd_size, hidden_size) self.norm_rel_ebd = [ x.strip() for x in norm_rel_ebd.lower().split("|") ] if "layer_norm" in self.norm_rel_ebd: self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.conv = ConvLayer( hidden_size=hidden_size, conv_kernel_size=conv_kernel_size, conv_act=conv_act, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, ) if conv_kernel_size > 0 else None self.gradient_checkpointing = False def get_rel_embedding(self): rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): rel_embeddings = self.LayerNorm(rel_embeddings) return rel_embeddings def get_attention_mask(self, attention_mask): if attention_mask.dim() <= 2: extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) attention_mask = extended_attention_mask * extended_attention_mask.squeeze( -2).unsqueeze(-1) attention_mask = attention_mask.cast(paddle.uint8) elif attention_mask.dim() == 3: attention_mask = attention_mask.unsqueeze(1) return attention_mask def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): if self.relative_attention and relative_pos is None: q = query_states.shape[ -2] if query_states is not None else hidden_states.shape[-2] relative_pos = build_relative_position( q, hidden_states.shape[-2], bucket_size=self.position_buckets, max_position=self.max_relative_positions) return relative_pos def forward( self, hidden_states, attention_mask, output_hidden_states=True, output_attentions=False, query_states=None, relative_pos=None, return_dict=True, ): if attention_mask.dim() <= 2: input_mask = attention_mask else: input_mask = (attention_mask.sum(-2) > 0).cast(paddle.uint8) attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None if isinstance(hidden_states, Sequence): next_kv = hidden_states[0] else: next_kv = hidden_states rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (output_states, ) if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs, output_attentions) return custom_forward output_states = paddle.utils.checkpoint.checkpoint( create_custom_forward(layer_module), next_kv, attention_mask, query_states, relative_pos, rel_embeddings, ) else: output_states = layer_module( next_kv, attention_mask, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, output_attentions=output_attentions, ) if output_attentions: output_states, att_m = output_states if i == 0 and self.conv is not None: output_states = 
self.conv(hidden_states, output_states, input_mask) if query_states is not None: query_states = output_states if isinstance(hidden_states, Sequence): next_kv = hidden_states[i + 1] if i + 1 < len( self.layer) else None else: next_kv = output_states if output_attentions: all_attentions = all_attentions + (att_m, ) if output_hidden_states: all_hidden_states = all_hidden_states + (output_states, ) if not return_dict: return tuple( v for v in [output_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions) def make_log_bucket_position(relative_pos, bucket_size, max_position): sign = paddle.sign(relative_pos.cast('float32')) mid = bucket_size // 2 abs_pos = paddle.where( (relative_pos < mid) & (relative_pos > -mid), paddle.to_tensor(mid - 1).astype(relative_pos.dtype), paddle.abs(relative_pos), ) log_pos = (paddle.ceil( paddle.log(abs_pos / mid) / paddle.log(paddle.to_tensor((max_position - 1) / mid)) * (mid - 1)) + mid) bucket_pos = paddle.where(abs_pos <= mid, relative_pos.cast(log_pos.dtype), log_pos * sign) return bucket_pos def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1): """ Build relative position according to the query and key We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - P_k\\) Args: query_size (int): the length of query key_size (int): the length of key bucket_size (int): the size of position bucket max_position (int): the maximum allowed absolute position Return: `paddle.LongTensor`: A tensor with shape [1, query_size, key_size] """ q_ids = paddle.arange(0, query_size) k_ids = paddle.arange(0, key_size) rel_pos_ids = q_ids[:, None] - k_ids[None, :] if bucket_size > 0 and max_position > 0: rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) rel_pos_ids = rel_pos_ids.cast(paddle.int64) rel_pos_ids = rel_pos_ids[:query_size, :] rel_pos_ids = rel_pos_ids.unsqueeze(0) return rel_pos_ids # Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): return c2p_pos.expand([ query_layer.shape[1], query_layer.shape[1], query_layer.shape[2], relative_pos.shape[-1] ]) # Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): return c2p_pos.expand([ query_layer.shape[0], query_layer.shape[1], key_layer.shape[-2], key_layer.shape[-2] ]) # Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand def pos_dynamic_expand(pos_index, p2c_att, key_layer): return pos_index.expand([ tuplt(p2c_att.shape[:2]) + (pos_index.shape[-2], key_layer.shape[-2]) ]) class DisentangledSelfAttention(nn.Layer): """ Disentangled self-attention module Parameters: """ def __init__( self, hidden_size=1536, num_attention_heads=24, attention_head_size=None, share_att_key=False, pos_att_type=None, relative_attention=False, position_buckets=-1, max_relative_positions=-1, max_position_embeddings=512, hidden_dropout_prob=0., attention_probs_dropout_prob=0., ): super().__init__() if hidden_size % num_attention_heads != 0: raise ValueError( f"The hidden size ({hidden_size}) is not a multiple of the number of attention " f"heads ({num_attention_heads})") self.num_attention_heads = num_attention_heads 
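        # The per-head size defaults to hidden_size // num_attention_heads unless an
        # explicit attention_head_size is given; the Q/K/V projections each map to
        # num_attention_heads * attention_head_size features.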
_attention_head_size = hidden_size // num_attention_heads self.attention_head_size = attention_head_size if attention_head_size is not None else _attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size self.query_proj = nn.Linear(hidden_size, self.all_head_size) self.key_proj = nn.Linear(hidden_size, self.all_head_size) self.value_proj = nn.Linear(hidden_size, self.all_head_size) self.share_att_key = share_att_key self.pos_att_type = pos_att_type if pos_att_type is not None else [] self.relative_attention = relative_attention if self.relative_attention: self.position_buckets = position_buckets self.max_relative_positions = max_relative_positions if self.max_relative_positions < 1: self.max_relative_positions = max_position_embeddings self.pos_ebd_size = self.max_relative_positions if self.position_buckets > 0: self.pos_ebd_size = self.position_buckets self.pos_dropout = StableDropout(hidden_dropout_prob) if not self.share_att_key: if "c2p" in self.pos_att_type: self.pos_key_proj = nn.Linear( hidden_size, self.all_head_size, bias=True) if "p2c" in self.pos_att_type: self.pos_query_proj = nn.Linear(hidden_size, self.all_head_size) self.dropout = StableDropout(attention_probs_dropout_prob) def transpose_for_scores(self, x, attention_heads): new_x_shape = tuple(x.shape[:-1]) + (attention_heads, -1) x = x.reshape(new_x_shape) return x.transpose([0, 2, 1, 3]).reshape([-1, x.shape[1], x.shape[-1]]) def forward( self, hidden_states, attention_mask, output_attentions=False, query_states=None, relative_pos=None, rel_embeddings=None, ): """ Call the module Args: hidden_states (`paddle.FloatTensor`): Input states to the module usually the output from previous layer, it will be the Q,K and V in *Attention(Q,K,V)* attention_mask (`paddle.uint8`): An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* th token. output_attentions (`bool`, optional): Whether return the attention matrix. query_states (`paddle.FloatTensor`, optional): The *Q* state in *Attention(Q,K,V)*. relative_pos (`paddle.LongTensor`): The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with values ranging in [*-max_relative_positions*, *max_relative_positions*]. rel_embeddings (`paddle.FloatTensor`): The embedding of relative distances. It's a tensor of shape [\\(2 \\times \\text{max_relative_positions}\\), *hidden_size*]. """ if query_states is None: query_states = hidden_states query_layer = self.transpose_for_scores( self.query_proj(query_states), self.num_attention_heads) key_layer = self.transpose_for_scores( self.key_proj(hidden_states), self.num_attention_heads) value_layer = self.transpose_for_scores( self.value_proj(hidden_states), self.num_attention_heads) rel_att = None # Take the dot product between "query" and "key" to get the raw attention scores. 
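        # Scores are scaled by sqrt(head_dim * scale_factor), where scale_factor counts
        # the enabled score terms: content-to-content plus the optional "c2p" and "p2c"
        # disentangled relative-position terms (at most 3 in total), as in DeBERTa's
        # scaled disentangled attention.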
scale_factor = 1 if "c2p" in self.pos_att_type: scale_factor += 1 if "p2c" in self.pos_att_type: scale_factor += 1 scale = paddle.sqrt( paddle.to_tensor( query_layer.shape[-1], dtype='float32') * scale_factor) attention_scores = paddle.bmm( query_layer, key_layer.transpose( [0, 2, 1])) / scale.cast(query_layer.dtype) if self.relative_attention: rel_embeddings = self.pos_dropout(rel_embeddings) rel_att = self.disentangled_attention_bias( query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) if rel_att is not None: attention_scores = attention_scores + rel_att attention_scores = attention_scores attention_scores = attention_scores.reshape([ -1, self.num_attention_heads, attention_scores.shape[-2], attention_scores.shape[-1] ]) # bsz x height x length x dimension attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) attention_probs = self.dropout(attention_probs) context_layer = paddle.bmm( attention_probs.reshape( [-1, attention_probs.shape[-2], attention_probs.shape[-1]]), value_layer) context_layer = (context_layer.reshape([ -1, self.num_attention_heads, context_layer.shape[-2], context_layer.shape[-1] ]).transpose([0, 2, 1, 3])) new_context_layer_shape = tuple(context_layer.shape[:-2]) + (-1, ) context_layer = context_layer.reshape(new_context_layer_shape) if output_attentions: return (context_layer, attention_probs) else: return context_layer def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): if relative_pos is None: q = query_layer.shape[-2] relative_pos = build_relative_position( q, key_layer.shape[-2], bucket_size=self.position_buckets, max_position=self.max_relative_positions) if relative_pos.dim() == 2: relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) elif relative_pos.dim() == 3: relative_pos = relative_pos.unsqueeze(1) # bsz x height x query x key elif relative_pos.dim() != 4: raise ValueError( f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}" ) att_span = self.pos_ebd_size relative_pos = relative_pos.cast(paddle.int64) rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0) if self.share_att_key: pos_query_layer = paddle.tile( self.transpose_for_scores( self.query_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) pos_key_layer = paddle.tile( self.transpose_for_scores( self.key_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) else: if "c2p" in self.pos_att_type: pos_key_layer = paddle.tile( self.transpose_for_scores( self.pos_key_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) # .split(self.all_head_size, dim=-1) if "p2c" in self.pos_att_type: pos_query_layer = paddle.tile( self.transpose_for_scores( self.pos_query_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) # .split(self.all_head_size, dim=-1) score = 0 # content->position if "c2p" in self.pos_att_type: scale = paddle.sqrt( paddle.to_tensor( pos_key_layer.shape[-1], dtype='float32') * scale_factor) c2p_att = paddle.bmm(query_layer, pos_key_layer.transpose([0, 2, 1])) c2p_pos = paddle.clip(relative_pos + att_span, 0, att_span * 2 - 1) c2p_att = paddle.take_along_axis( c2p_att, axis=-1, indices=c2p_pos.squeeze(0).expand([ query_layer.shape[0], query_layer.shape[1], relative_pos.shape[-1] ]), ) score += c2p_att / scale.cast(dtype=c2p_att.dtype) # position->content if "p2c" in self.pos_att_type: scale = paddle.sqrt( paddle.to_tensor( pos_query_layer.shape[-1], dtype='float32') * scale_factor) if key_layer.shape[-2] != query_layer.shape[-2]: r_pos = build_relative_position( key_layer.shape[-2], key_layer.shape[-2], bucket_size=self.position_buckets, max_position=self.max_relative_positions, ) r_pos = r_pos.unsqueeze(0) else: r_pos = relative_pos p2c_pos = paddle.clip(-r_pos + att_span, 0, att_span * 2 - 1) p2c_att = paddle.bmm(key_layer, pos_query_layer.transpose([0, 2, 1])) p2c_att = paddle.take_along_axis( p2c_att, axis=-1, indices=p2c_pos.squeeze(0).expand([ query_layer.shape[0], key_layer.shape[-2], key_layer.shape[-2] ]), ).transpose([0, 2, 1]) score += p2c_att / scale.cast(dtype=p2c_att.dtype) return score # Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm class DebertaV2Embeddings(nn.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__( self, max_position_embeddings=512, position_biased_input=False, pad_token_id=0, hidden_size=1536, hidden_dropout_prob=0.1, embedding_size=None, vocab_size=128100, type_vocab_size=0, layer_norm_eps=1e-7, ): super().__init__() self.embedding_size = hidden_size if embedding_size is None else embedding_size self.word_embeddings = nn.Embedding( vocab_size, self.embedding_size, padding_idx=pad_token_id) self.type_vocab_size = type_vocab_size self.hidden_size = hidden_size self.position_biased_input = position_biased_input if not self.position_biased_input: self.position_embeddings = None else: self.position_embeddings = nn.Embedding(max_position_embeddings, self.embedding_size) if type_vocab_size > 0: self.token_type_embeddings = nn.Embedding(type_vocab_size, self.embedding_size) if self.embedding_size != hidden_size: self.embed_proj = nn.Linear(self.embedding_size, hidden_size) self.LayerNorm = LayerNorm(hidden_size, 
layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer("position_ids", paddle.arange(max_position_embeddings).expand( (1, -1))) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.shape else: input_shape = inputs_embeds.shape[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = self.position_ids[:, :seq_length] if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) if self.position_embeddings is not None: position_embeddings = self.position_embeddings( position_ids.cast(paddle.int64)) else: position_embeddings = paddle.zeros_like(inputs_embeds) embeddings = inputs_embeds if self.position_biased_input: embeddings += position_embeddings if self.type_vocab_size > 0: token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings += token_type_embeddings if self.embedding_size != self.hidden_size: embeddings = self.embed_proj(embeddings) embeddings = self.LayerNorm(embeddings) if mask is not None: if mask.dim() != embeddings.dim(): if mask.dim() == 4: mask = mask.squeeze(1).squeeze(1) mask = mask.unsqueeze(2) mask = mask.cast('float32') embeddings = embeddings * mask embeddings = self.dropout(embeddings) return embeddings # Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 class DebertaV2PreTrainedModel(nn.Layer): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ base_model_prefix = "deberta" _keys_to_ignore_on_load_missing = ["position_ids"] _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 normal_(module.weight, mean=0.0, std=0.02) if module.bias is not None: constant_init(module.bias, 0.) elif isinstance(module, nn.Embedding): normal_(module.weight, mean=0.0, std=0.02) if module.padding_idx is not None: constant_init(module.weight.data[module.padding_idx], 0.) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, DebertaV2Encoder): module.gradient_checkpointing = value DEBERTA_START_DOCSTRING = r""" The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. This model is also a PyTorch [paddle.nn.Layer](https://pytorch.org/docs/stable/nn.html#paddle.nn.Layer) subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. 
Parameters: """ # Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 class DebertaV2Model(DebertaV2PreTrainedModel): def __init__(self, _name_or_path="cache/deberta-v-xxlarge", attention_head_size=64, attention_probs_dropout_prob=0.1, conv_act="gelu", conv_kernel_size=3, hidden_act="gelu", hidden_dropout_prob=0.1, hidden_size=1536, initializer_range=0.02, intermediate_size=6144, layer_norm_eps=1e-07, max_position_embeddings=512, max_relative_positions=-1, model_type="deberta-v2", norm_rel_ebd="layer_norm", num_attention_heads=24, num_hidden_layers=48, pad_token_id=0, pooler_dropout=0, pooler_hidden_act="gelu", pooler_hidden_size=1536, pos_att_type=["p2c", "c2p"], position_biased_input=False, position_buckets=256, relative_attention=True, share_att_key=True, type_vocab_size=0, vocab_size=128100, output_attentions=False, output_hidden_states=False, use_return_dict=True): super().__init__() self.embeddings = DebertaV2Embeddings( max_position_embeddings=max_position_embeddings, position_biased_input=position_biased_input, pad_token_id=pad_token_id, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, vocab_size=vocab_size, type_vocab_size=type_vocab_size, layer_norm_eps=layer_norm_eps) self.encoder = DebertaV2Encoder( num_hidden_layers=num_hidden_layers, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, relative_attention=relative_attention, max_relative_positions=max_relative_positions, max_position_embeddings=max_position_embeddings, position_buckets=position_buckets, hidden_size=hidden_size, norm_rel_ebd=norm_rel_ebd, conv_kernel_size=conv_kernel_size, hidden_act=hidden_act, conv_act=conv_act, intermediate_size=intermediate_size, share_att_key=share_att_key, pos_att_type=pos_att_type, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob, ) self.z_steps = 0 self.output_attentions = output_attentions self.output_hidden_states = output_hidden_states self.use_return_dict = use_return_dict def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError( "The prune function is not implemented in DeBERTa model.") def forward( self, input_ids: Optional[paddle.Tensor]=None, attention_mask: Optional[paddle.Tensor]=None, token_type_ids: Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor]=None, inputs_embeds: Optional[paddle.Tensor]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.output_hidden_states) return_dict = return_dict if return_dict is not None else self.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time" ) elif input_ids is not None: input_shape = input_ids.shape elif inputs_embeds is not None: input_shape = inputs_embeds.shape[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = paddle.ones(input_shape) if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids, mask=attention_mask, inputs_embeds=inputs_embeds, ) encoder_outputs = self.encoder( embedding_output, attention_mask, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict, ) encoded_layers = encoder_outputs[1] if self.z_steps > 1: hidden_states = encoded_layers[-2] layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] query_states = encoded_layers[-1] rel_embeddings = self.encoder.get_rel_embedding() attention_mask = self.encoder.get_attention_mask(attention_mask) rel_pos = self.encoder.get_rel_pos(embedding_output) for layer in layers[1:]: query_states = layer( hidden_states, attention_mask, output_attentions=False, query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, ) encoded_layers.append(query_states) sequence_output = encoded_layers[-1] if not return_dict: return (sequence_output, ) + encoder_outputs[(1 if output_hidden_states else 2):] return BaseModelOutput( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, attentions=encoder_outputs.attentions, ) def get_debertav2_model(name, pretrained=True): if name is None: return None model = DebertaV2Model( _name_or_path=name, attention_head_size=64, attention_probs_dropout_prob=0.1, conv_act="gelu", conv_kernel_size=3, hidden_act="gelu", hidden_dropout_prob=0.1, hidden_size=1536, initializer_range=0.02, intermediate_size=6144, layer_norm_eps=1e-07, max_position_embeddings=512, max_relative_positions=-1, model_type="deberta-v2", norm_rel_ebd="layer_norm", num_attention_heads=24, num_hidden_layers=48, pad_token_id=0, pooler_dropout=0, pooler_hidden_act="gelu", pooler_hidden_size=1536, pos_att_type=["p2c", "c2p"], position_biased_input=False, position_buckets=256, relative_attention=True, share_att_key=True, type_vocab_size=0, vocab_size=128100, output_attentions=False, output_hidden_states=False, use_return_dict=True, ) if pretrained: checkpoint = paddle.load(name + '/debertav2.pd', return_numpy=True) 
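        # Restore the converted Paddle weights, then put the encoder in eval mode and
        # stop gradients on every parameter so it can be used as a frozen text encoder.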
model.set_state_dict(checkpoint['model']) model.eval() for p in model.parameters(): p.stop_gradient = True return model def dict_from_json_file(name): with open(name + '/config.json', "r", encoding="utf-8") as reader: text = reader.read() config_dict = json.loads(text) return config_dict def debertav2_encode_text(debertav2, texts, tokenizer, return_attn_mask=False): token_ids, attn_mask = debertav2_tokenize(texts, tokenizer) debertav2.eval() with paddle.no_grad(): output = debertav2(input_ids=token_ids, attention_mask=attn_mask) encoded_text = output.last_hidden_state.detach() attn_mask = attn_mask.cast(bool) encoded_text = paddle.where(attn_mask[:, :, None] == 0, paddle.to_tensor(0.), encoded_text) if return_attn_mask: return encoded_text, attn_mask return encoded_text def get_debertav2_encoded_dim(name): return dict_from_json_file(name)['hidden_size'] if __name__ == '__main__': model = get_debertav2_model( name='/dbq/codes/CL/paddle-imagen/cache/deberta-v-xxlarge', pretrained=False) ================================================ FILE: ppfleetx/models/language_model/ernie/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .ernie_module import ErnieModule, ErnieSeqClsModule from .auto.auto_module import ErnieModuleAuto, ErnieSeqClsModuleAuto ================================================ FILE: ppfleetx/models/language_model/ernie/auto/__init__.py ================================================ ================================================ FILE: ppfleetx/models/language_model/ernie/auto/auto_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import io import copy import logging import json import paddle import paddle.nn as nn import paddle.distributed.auto_parallel as auto from paddle.nn import functional as F from paddle.nn.initializer.lazy_init import _lazy_init_helper from dataclasses import dataclass, field from ..layers.model_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ErnieForPreTrainingOutput, SequenceClassifierOutput, ) from .auto_transformer import TransformerEncoderLayer, TransformerEncoder class Embedding(nn.Layer): def __init__( self, num_embeddings, embedding_dim, padding_idx=None, sparse=False, weight_attr=None, name=None, ): super().__init__() self._num_embeddings = num_embeddings self._embedding_dim = embedding_dim self._sparse = sparse self._is_distributed = False self._padding_idx = padding_idx if self._num_embeddings <= 0: raise ValueError("num_embeddings must be gather than 0") if self._embedding_dim <= 0: raise ValueError("embedding_dim must be gather than 0") padding_idx = (-1 if padding_idx is None else padding_idx if padding_idx >= 0 else (num_embeddings + padding_idx)) if padding_idx >= num_embeddings or padding_idx < -num_embeddings: raise ValueError("padding_idx must be within [-{}, {})".format( num_embeddings, num_embeddings)) self._dtype = self._helper.get_default_dtype() self._size = [self._num_embeddings, self._embedding_dim] self._weight_attr = weight_attr self._remote_prefetch = False self._name = name self.weight = self.create_parameter( attr=self._weight_attr, shape=self._size, dtype=self._dtype, is_bias=False, ) if paddle.in_dynamic_mode( ) and padding_idx != -1 and not _lazy_init_helper.state: with paddle.no_grad(): self.weight[padding_idx] = 0.0 def forward(self, x): return F.embedding( x, weight=self.weight, padding_idx=self._padding_idx, sparse=self._sparse, name=self._name, ) def extra_repr(self): main_str = '{_num_embeddings}, {_embedding_dim}' if self._padding_idx is not None: main_str += ', padding_idx={_padding_idx}' main_str += ', sparse={_sparse}' if self._name is not None: main_str += ', name={_name}' return main_str.format(**self.__dict__) class ErnieEmbeddings(nn.Layer): r""" Include embeddings from word, position and token_type embeddings. 
""" def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, weight_attr=None, task_type_vocab_size=3, task_id=0, use_task_id=False, mesh=None): super(ErnieEmbeddings, self).__init__() self.mesh = mesh self.word_embeddings = Embedding( vocab_size, hidden_size, padding_idx=pad_token_id, weight_attr=weight_attr) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=weight_attr) self.type_vocab_size = type_vocab_size if self.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding( type_vocab_size, hidden_size, weight_attr=weight_attr) self.use_task_id = use_task_id self.task_id = task_id if self.use_task_id: self.task_type_embeddings = nn.Embedding( task_type_vocab_size, hidden_size, weight_attr=weight_attr) self.layer_norm = nn.LayerNorm(hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None, task_type_ids=None, inputs_embeds=None, past_key_values_length=None): if input_ids is not None: auto.shard_tensor(self.word_embeddings.weight, self.mesh[0], [self.mesh.mp, None]) input_shape = paddle.shape(input_ids) input_embeddings = self.word_embeddings(input_ids) else: input_shape = paddle.shape(inputs_embeds)[:-1] input_embeddings = inputs_embeds if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph #seq_length = input_ids.shape[1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) position_ids = seq_length - ones if past_key_values_length is not None: position_ids += past_key_values_length position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) embeddings = input_embeddings + position_embeddings if self.type_vocab_size > 0: if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = embeddings + token_type_embeddings if self.use_task_id: if task_type_ids is None: task_type_ids = paddle.ones( input_shape, dtype="int64") * self.task_id task_type_embeddings = self.task_type_embeddings(task_type_ids) embeddings = embeddings + task_type_embeddings embeddings = self.layer_norm(embeddings) embeddings = self.dropout(embeddings) return embeddings class ErniePooler(nn.Layer): def __init__(self, hidden_size, weight_attr=None): super(ErniePooler, self).__init__() self.dense = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class ErnieModelAuto(nn.Layer): r""" The bare ERNIE Model transformer outputting raw hidden-states. This model is a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: vocab_size (int): Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. hidden_size (int, optional): Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. 
num_hidden_layers (int, optional): Number of hidden layers in the Transformer encoder. Defaults to `12`. num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. Defaults to `12`. intermediate_size (int, optional): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. Defaults to `3072`. hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. Defaults to `"gelu"`. hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. Defaults to `0.1`. attention_probs_dropout_prob (float, optional): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. Defaults to `0.1`. max_position_embeddings (int, optional): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. Defaults to `512`. type_vocab_size (int, optional): The vocabulary size of the `token_type_ids`. Defaults to `2`. initializer_range (float, optional): The standard deviation of the normal initializer for initializing all weight matrices. Defaults to `0.02`. .. note:: A normal_initializer initializes weight matrices as normal distributions. See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. pad_token_id(int, optional): The index of padding token in the token vocabulary. Defaults to `0`. """ def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False, mesh=None): super(ErnieModelAuto, self).__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.initializer_range)) self.embeddings = ErnieEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, pad_token_id, weight_attr, task_type_vocab_size, task_id, use_task_id, mesh) encoder_layer = TransformerEncoderLayer( hidden_size, num_attention_heads, intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=0, weight_attr=weight_attr, normalize_before=False, mesh=mesh, mesh_idx=0) self.encoder = TransformerEncoder( encoder_layer, num_hidden_layers, enable_recompute=use_recompute, mesh=mesh) self.pooler = ErniePooler(hidden_size, weight_attr) self.apply(self.init_weights) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, task_type_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_hidden_states=False, 
output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. token_type_ids (Tensor, optional): Segment token indices to indicate different portions of the inputs. Selected in the range ``[0, type_vocab_size - 1]``. If `type_vocab_size` is 2, which means the inputs have two portions. Indices can either be 0 or 1: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. Defaults to `None`, which means we don't add segment embeddings. position_ids (Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, max_position_embeddings - 1]``. Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. attention_mask (Tensor, optional): Mask used in multi-head attention to avoid performing attention on to some unwanted positions, usually the paddings or the subsequent positions. Its data type can be int, float and bool. When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. When the data type is int, the `masked` tokens have `0` values and the others have `1` values. When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], [batch_size, num_attention_heads, sequence_length, sequence_length]. We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, "使" and "用" will have the same value. Defaults to `None`, which means nothing needed to be prevented attention to. inputs_embeds (Tensor, optional): If you want to control how to convert `inputs_ids` indices into associated vectors, you can pass an embedded representation directly instead of passing `inputs_ids`. past_key_values (tuple(tuple(Tensor)), optional): The length of tuple equals to the number of layers, and each inner tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) which contains precomputed key and value hidden states of the attention blocks. If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are returned. Defaults to `None`. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. """ if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time." ) elif input_ids is not None: input_shape = paddle.shape(input_ids) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") past_key_values_length = None if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length) self.encoder._use_cache = use_cache # To be consistent with HF encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, cache=past_key_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = encoder_outputs pooled_output = self.pooler(sequence_output) return (sequence_output, pooled_output) else: sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErnieLMPredictionHead(nn.Layer): r""" Ernie Model with a `language modeling` head on top. 
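    This head projects encoder hidden states back onto the vocabulary to produce
    masked-token prediction scores.

    Examples:

        A minimal usage sketch (the sizes below are illustrative only):

        .. code-block:: python

            import paddle

            # 768-d hidden states, 30522-word vocabulary, GELU activation
            head = ErnieLMPredictionHead(
                hidden_size=768, vocab_size=30522, activation="gelu")
            hidden_states = paddle.rand([2, 16, 768])  # [batch_size, seq_len, hidden_size]
            prediction_scores = head(hidden_states)    # [2, 16, 30522]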
""" def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErnieLMPredictionHead, self).__init__() self.transform = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = getattr(nn.functional, activation) self.layer_norm = nn.LayerNorm(hidden_size) self.decoder_weight = self.create_parameter( shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, attr=weight_attr, is_bias=False) # if embedding_weights is None else embedding_weights self.decoder_bias = self.create_parameter( shape=[self.decoder_weight.shape[0]], dtype=self.decoder_weight.dtype, is_bias=True) def forward(self, hidden_states, masked_positions=None): if masked_positions is not None: hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) hidden_states = paddle.tensor.gather(hidden_states, masked_positions) # gather masked tokens might be more quick hidden_states = self.transform(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.layer_norm(hidden_states) # hidden_states = parallel_matmul(hidden_states, self.decoder_weight, True) + self.decoder_bias hidden_states = paddle.matmul( hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias return hidden_states class ErniePretrainingHeads(nn.Layer): def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErniePretrainingHeads, self).__init__() self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights, weight_attr) self.seq_relationship = nn.Linear( hidden_size, 2, weight_attr=weight_attr) def forward(self, sequence_output, pooled_output, masked_positions=None): prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class ErnieForPretrainingAuto(nn.Layer): r""" Ernie Model with a `masked language modeling` head and a `sentence order prediction` head on top. """ def __init__(self, ernie): super(ErnieForPretrainingAuto, self).__init__() self.ernie = ernie weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.ernie.initializer_range)) self.cls = ErniePretrainingHeads( self.ernie.hidden_size, self.ernie.vocab_size, self.ernie.hidden_act, embedding_weights=self.ernie.embeddings.word_embeddings.weight, weight_attr=weight_attr, ) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_positions=None, position_ids=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids (Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. inputs_embeds(Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size, sequence_length)`, optional): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. next_sentence_label (Tensor of shape `(batch_size,)`, optional): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`. """ # with paddle.static.amp.fp16_guard(): outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output, masked_positions) total_loss = None if labels is not None and next_sentence_label is not None: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.reshape( (-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1, ))) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1, ))) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ( (total_loss, ) + output) if total_loss is not None else output return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErniePretrainingCriterionAuto(paddle.nn.Layer): r""" The loss output of Ernie Model during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """ def __init__(self, with_nsp_loss=True): super(ErniePretrainingCriterionAuto, self).__init__() self.with_nsp_loss = with_nsp_loss def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): """ Args: prediction_scores(Tensor): The scores of masked token prediction. Its data type should be float32. If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. Otherwise, its shape is [batch_size, mask_token_num, vocab_size] seq_relationship_score(Tensor): The scores of next sentence prediction. 
Its data type should be float32 and its shape is [batch_size, 2] masked_lm_labels(Tensor): The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. Otherwise, its shape is [batch_size, mask_token_num, 1] next_sentence_labels(Tensor): The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1] Returns: Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. Its data type should be float32 and its shape is [1]. """ # with paddle.static.amp.fp16_guard(): masked_lm_loss = F.cross_entropy( prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none') if not self.with_nsp_loss: return paddle.mean(masked_lm_loss) next_sentence_loss = F.cross_entropy( seq_relationship_score, next_sentence_labels, reduction='none') loss = paddle.mean(masked_lm_loss) + paddle.mean(next_sentence_loss) return loss class ErnieForSequenceClassificationAuto(nn.Layer): """ Ernie Model with a linear layer on top of the output layer, designed for sequence classification/regression tasks like GLUE tasks. Args: ernie (:class:`ErnieModel`): An instance of ErnieModel. num_classes (int, optional): The number of classes. Defaults to `2`. dropout (float, optional): The dropout probability for output of ERNIE. If None, use the same value as `hidden_dropout_prob` of `ErnieModel` instance `ernie`. Defaults to None. """ def __init__(self, ernie, num_classes=2, dropout=None): super(ErnieForSequenceClassificationAuto, self).__init__() self.num_classes = num_classes self.ernie = ernie # allow ernie to be config self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.hidden_dropout_prob) self.classifier = nn.Linear(self.ernie.hidden_size, num_classes) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" The ErnieForSequenceClassification forward method, overrides the __call__() special method. Args: input_ids (Tensor): See :class:`ErnieModelAuto`. token_type_ids (Tensor, optional): See :class:`ErnieModelAuto`. position_ids(Tensor, optional): See :class:`ErnieModelAuto`. attention_mask (Tensor, optional): See :class:`ErnieModelAuto`. labels (Tensor of shape `(batch_size,)`, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` a regression loss is computed (Mean-Square loss), If `num_classes > 1` a classification loss is computed (Cross-Entropy). output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`. """ outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_classes == 1: loss_fct = paddle.nn.MSELoss() loss = loss_fct(logits, labels) elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: loss_fct = paddle.nn.CrossEntropyLoss() loss = loss_fct( logits.reshape((-1, self.num_classes)), labels.reshape((-1, ))) else: loss_fct = paddle.nn.BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else ( output[0] if len(output) == 1 else output) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 ================================================ FILE: ppfleetx/models/language_model/ernie/auto/auto_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
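# Note on the auto-parallel annotations used by `auto_model.py` above: each shardable
# weight is annotated with `auto.shard_tensor` against a named process mesh produced
# by `process_mesh_config`. A minimal, illustrative sketch of that pattern follows
# (the 2x2 mesh, the axis names "dp"/"mp" and the helper name
# `_shard_embedding_sketch` are assumptions for illustration, not part of this module):
#
#     import paddle.distributed.auto_parallel as auto
#
#     def _shard_embedding_sketch(weight):
#         # 2 data-parallel groups x 2 model-parallel groups over ranks 0-3
#         mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["dp", "mp"])
#         # split the vocabulary (first) dimension of the embedding weight over "mp";
#         # the hidden dimension stays replicated
#         auto.shard_tensor(weight, mesh, ["mp", None])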
import sys import copy import paddle from paddle import LazyGuard from ppfleetx.core.module.basic_module import BasicModule from ppfleetx.utils.log import logger from .auto_model import ( ErnieModelAuto, ErnieForPretrainingAuto, ErniePretrainingCriterionAuto, ErnieForSequenceClassificationAuto, ) from ppfleetx.models.language_model.auto_utils import process_configs, process_mesh_config import numpy as np def process_data_configs(config): """ process data configs for hybrid parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] cfg_data[mode]['dataset'].setdefault('binary_head', cfg_global['binary_head']) cfg_data[mode]['collate_fn'].setdefault( 'micro_batch_size', cfg_global['micro_batch_size']) def process_model_configs(config): mesh = process_mesh_config(config['Distributed']) cfg_model = config['Model'] hidden_size = cfg_model['hidden_size'] cfg_model.update({'mesh': mesh}) cfg_model.setdefault("intermediate_size", hidden_size * 4) class ErnieModuleAuto(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieModuleAuto, self).__init__(configs) self.nranks = paddle.distributed.get_world_size() self.binary_head = self.configs['Global']['binary_head'] self.loss_fn = ErniePretrainingCriterionAuto(self.binary_head) def process_configs(self, configs): process_data_configs(configs) process_model_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") with LazyGuard(): model = ErnieForPretrainingAuto(ErnieModelAuto(**model_setting)) return model def input_spec(self): inputs_spec = [ paddle.static.InputSpec( shape=[None, None], name="input_ids", dtype="int64"), paddle.static.InputSpec( shape=[None, None], name="token_type_ids", dtype="int64"), paddle.static.InputSpec( shape=[None, None], name="position_ids", dtype="int64"), ] return inputs_spec class ErnieSeqClsModuleAuto(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieSeqClsModuleAuto, self).__init__(configs) def process_configs(self, configs): process_model_configs(configs) cfg_global = configs['Global'] cfg_data = configs['Data'] for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['collate_fn'].setdefault( 'tokenizer_type', cfg_data[mode]['dataset']['tokenizer_type']) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") with LazyGuard(): model = ErnieForSequenceClassificationAuto( ErnieModelAuto(**model_setting)) return model def input_spec(self): input_spec = [ paddle.static.InputSpec( shape=[None, None], dtype="int64", name='input_ids'), paddle.static.InputSpec( shape=[None, None], dtype="int64", name='token_type_ids') ] return input_spec ================================================ FILE: 
ppfleetx/models/language_model/ernie/auto/auto_transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # TODO: define the classes of Transformer neural network import copy import collections import numpy as np import paddle import paddle.nn.functional as F import paddle.nn as nn import paddle.distributed.auto_parallel as auto from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer import paddle.tensor as tensor from paddle.fluid import layers from paddle import ParamAttr from paddle.fluid.data_feeder import convert_dtype from ..layers.model_outputs import BaseModelOutputWithPastAndCrossAttentions __all__ = [] def _convert_param_attr_to_list(param_attr, n): """ If `param_attr` is a list or tuple, convert every element in it to a ParamAttr instance. Otherwise, repeat `param_attr` `n` times to construct a list, and rename every one by appending a increasing index suffix to avoid having same names when `param_attr` contains a name. Parameters: param_attr (list|tuple|ParamAttr): A list, tuple or something can be converted to a ParamAttr instance by `ParamAttr._to_attr`. n (int): The times to repeat to construct a list when `param_attr` is not a list or tuple. Returns: list: A list composed of each including cell's `param_attr`. """ if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) param_attrs = [] for attr in param_attr: if isinstance(attr, bool): if attr: param_attrs.append(ParamAttr._to_attr(None)) else: param_attrs.append(False) else: param_attrs.append(ParamAttr._to_attr(attr)) # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] elif isinstance(param_attr, bool): param_attrs = [] if param_attr: param_attrs = [ParamAttr._to_attr(None) for i in range(n)] else: param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) for i in range(n): attr_i = copy.deepcopy(attr) if attr.name: attr_i.name = attr_i.name + "_" + str(i) param_attrs.append(attr_i) return param_attrs def _convert_attention_mask(attn_mask, dtype): """ Convert the attention mask to the target dtype we expect. Parameters: attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. 
Returns: Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. """ if attn_mask is not None and attn_mask.dtype != dtype: attn_mask_dtype = convert_dtype(attn_mask.dtype) if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 else: attn_mask = paddle.cast(attn_mask, dtype) return attn_mask class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. Please refer to `Attention Is All You Need `_ for more details. Parameters: embed_dim (int): The expected feature size in the input and output. num_heads (int): The number of heads in multi-head attention. dropout (float, optional): The dropout probability used on attention weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Examples: .. code-block:: python import paddle # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, mesh=None, mesh_idx=None): super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.mesh = mesh self.mesh_idx = mesh_idx self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.q_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. 
If `cache` is not None, using cached results to reduce redundant calculations. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ and `[batch_size, n_head, sequence_length, d_value]` separately, \ and their data types are same as inputs. """ auto.shard_tensor(self.q_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. 
""" auto.shard_tensor(self.k_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.v_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used for decoder-encoder cross attention. The cache is generated as follows: 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first dimension of `key`. 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def forward(self, query, key=None, value=None, attn_mask=None, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. 
key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ If `cache` is not None, the tuple then includes the new cache \ having the same type as `cache`, and if it is `StaticCache`, it \ is same as the input `cache`, if it is `Cache`, the new cache \ reserves tensors concatanating raw tensors with intermediate \ results of current query. 
""" key = query if key is None else key value = query if value is None else value # compute q ,k ,v if cache is None: q, k, v = self._prepare_qkv(query, key, value, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = paddle.matmul( x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: # with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) auto.shard_tensor(self.out_proj.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if cache is not None: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If `normalize_before` is True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Parameters: d_model (int): The expected feature size in the input and output. nhead (int): The number of heads in multi-head attention(MHA). dim_feedforward (int): The hidden layer size in the feedforward network(FFN). dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward network. Default relu. attn_dropout (float, optional): The dropout probability used in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. The `False` value means the corresponding layer would not have trainable bias parameter. 
See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False, weight_attr=None, bias_attr=None, mesh=None, mesh_idx=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.mesh = mesh self.mesh_idx = mesh_idx weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], mesh=mesh, mesh_idx=mesh_idx) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout, mode="upscale_in_train") self.dropout2 = Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, src, src_mask=None, cache=None, output_attentions=False): r""" Applies a Transformer encoder layer on the input. Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. 
Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ self.self_attn.need_weights = output_attentions src_mask = _convert_attention_mask(src_mask, src.dtype) auto.shard_tensor(self.linear1.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.linear2.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) residual = src if self.normalize_before: src = self.norm1(src) attn_outputs = self.self_attn(src, src, src, src_mask, cache) if isinstance(attn_outputs, tuple): src = attn_outputs[0] outputs = attn_outputs[1:] else: src = attn_outputs outputs = None src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src if outputs is None else ( (src, ) + outputs[::-1]) # hidden_states, cache, attentions def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( src, type=self.self_attn.Cache) return incremental_cache class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It would be used as the first layer, and the other layers would be created according to the configurations of it. num_layers (int): The number of encoder layers to be stacked. norm (LayerNorm, optional): the layer normalization component. If provided, apply layer normalization on the output of last encoder layer. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None, enable_recompute=False, mesh=None): super(TransformerEncoder, self).__init__() self.stages = mesh.stages(num_layers) self.layers = nn.LayerList() for i in range(num_layers): if i == 0: self.layers.append(encoder_layer) else: encoder_layer._config.update({ "mesh": mesh, "mesh_idx": self.stages[i] }) self.layers.append( type(encoder_layer)(**encoder_layer._config)) self.num_layers = num_layers self.norm = norm self.enable_recompute = enable_recompute def forward(self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): r""" Applies a stack of N Transformer encoder layers on inputs. 
If `norm` is provided, also applies layer normalization on the output of last encoder layer. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `src`, representing the output of Transformer encoder. \ Or a tuple if `cache` is not None, except for encoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ src_mask = _convert_attention_mask(src_mask, src.dtype) output = src # To get cache from None when use_cache is True, which is compatible with HF # while HF requires decoder. The implementation here uses cache update in the # MultiHeadAttention not so efficiently, and maybe optimize it later. if cache is None and getattr(self, "_use_cache", False): cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts # to True when cache is not None. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None all_attentions = [] if output_attentions else None # NOTE: Also includes embeding output which is same as HF. 
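        # In the loop below, each layer's input activation is re-annotated with
        # auto.shard_tensor so that its batch dimension stays mapped to the
        # data-parallel ("dp") axis of that layer's process mesh.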
all_hidden_states = [output] if output_hidden_states else None for i, mod in enumerate(self.layers): auto.shard_tensor( output, mod.mesh[mod.mesh_idx], [mod.mesh.dp] + [None for i in range(len(output.shape) - 1)]) if self.enable_recompute: layer_outputs = auto.recompute(mod)( output, src_mask, None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions) else: layer_outputs = mod( output, src_mask=src_mask, cache=None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions=output_attentions) if isinstance(layer_outputs, tuple): output = layer_outputs[0] outputs = layer_outputs[1:] else: output = layer_outputs outputs = None if output_hidden_states: all_hidden_states.append(output) if output_attentions: all_attentions.append(outputs[-1]) if new_caches is not None: new_caches.append(outputs[0] if isinstance(cache[ i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) if self.norm is not None: output = self.norm(output) if output_hidden_states: all_hidden_states[-1] = output if not return_dict: return output return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=output, past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_attentions) def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: list: It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] return cache ================================================ FILE: ppfleetx/models/language_model/ernie/dygraph/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/ernie/dygraph/hybrid_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import os import io import copy import logging import json import paddle import paddle.nn as nn from paddle.nn import functional as F from dataclasses import dataclass, field from ..layers.model_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ErnieForPreTrainingOutput, SequenceClassifierOutput, ) from ..layers.distributed_transformer import TransformerEncoderLayer, TransformerEncoder from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc from ppfleetx.distributed.apis import env def parallel_matmul(lm_output, logit_weights, parallel_output): """ """ hcg = env.get_hcg() model_parallel_group = hcg.get_model_parallel_group() world_size = hcg.get_model_parallel_world_size() rank = hcg.get_model_parallel_rank() if world_size > 1: input_parallel = paddle.distributed.collective._c_identity( lm_output, group=model_parallel_group) logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True) if parallel_output: return logits return paddle.distributed.collective._c_concat( logits, group=model_parallel_group) else: logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) return logits class ErnieEmbeddings(nn.Layer): r""" Include embeddings from word, position and token_type embeddings. """ def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, weight_attr=None, task_type_vocab_size=3, task_id=0, use_task_id=False): super(ErnieEmbeddings, self).__init__() # self.word_embeddings = nn.Embedding( # vocab_size, # hidden_size, # padding_idx=pad_token_id, # weight_attr=weight_attr) self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( vocab_size, hidden_size, weight_attr=weight_attr) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=weight_attr) self.type_vocab_size = type_vocab_size if self.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding( type_vocab_size, hidden_size, weight_attr=weight_attr) self.use_task_id = use_task_id self.task_id = task_id if self.use_task_id: self.task_type_embeddings = nn.Embedding( task_type_vocab_size, hidden_size, weight_attr=weight_attr) self.layer_norm = nn.LayerNorm(hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None, task_type_ids=None, inputs_embeds=None, past_key_values_length=None): if input_ids is not None: input_shape = paddle.shape(input_ids) input_embeddings = self.word_embeddings(input_ids) else: input_shape = paddle.shape(inputs_embeds)[:-1] input_embeddings = inputs_embeds if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph #seq_length = input_ids.shape[1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) position_ids = seq_length - ones if past_key_values_length is not None: position_ids += past_key_values_length position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) embeddings = input_embeddings + position_embeddings if self.type_vocab_size > 0: if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = embeddings + 
token_type_embeddings if self.use_task_id: if task_type_ids is None: task_type_ids = paddle.ones( input_shape, dtype="int64") * self.task_id task_type_embeddings = self.task_type_embeddings(task_type_ids) embeddings = embeddings + task_type_embeddings embeddings = self.layer_norm(embeddings) embeddings = self.dropout(embeddings) return embeddings class ErniePooler(nn.Layer): def __init__(self, hidden_size, weight_attr=None): super(ErniePooler, self).__init__() self.dense = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class ErnieModelHybrid(nn.Layer): r""" The bare ERNIE Model transformer outputting raw hidden-states. This model is a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: vocab_size (int): Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. hidden_size (int, optional): Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. num_hidden_layers (int, optional): Number of hidden layers in the Transformer encoder. Defaults to `12`. num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. Defaults to `12`. intermediate_size (int, optional): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. Defaults to `3072`. hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. Defaults to `"gelu"`. hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. Defaults to `0.1`. attention_probs_dropout_prob (float, optional): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. Defaults to `0.1`. max_position_embeddings (int, optional): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. Defaults to `512`. type_vocab_size (int, optional): The vocabulary size of the `token_type_ids`. Defaults to `2`. initializer_range (float, optional): The standard deviation of the normal initializer for initializing all weight matrices. Defaults to `0.02`. .. note:: A normal_initializer initializes weight matrices as normal distributions. See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. pad_token_id(int, optional): The index of padding token in the token vocabulary. Defaults to `0`. 
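# A minimal sketch, assuming toy tensor shapes, of how ErnieEmbeddings.forward
# above derives default position_ids when the caller does not pass any:
# a cumulative sum over ones, shifted back by one.
import paddle

toy_shape = [2, 5]                               # [batch_size, sequence_length]
ones = paddle.ones(toy_shape, dtype="int64")
seq_length = paddle.cumsum(ones, axis=1)         # 1, 2, 3, ...
position_ids = seq_length - ones                 # 0, 1, 2, ...
print(position_ids.numpy())                      # [[0 1 2 3 4] [0 1 2 3 4]]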
""" def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False, num_partitions=1): super(ErnieModelHybrid, self).__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.initializer_range)) self.embeddings = ErnieEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, pad_token_id, weight_attr, task_type_vocab_size, task_id, use_task_id) encoder_layer = TransformerEncoderLayer( hidden_size, num_attention_heads, intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=0, weight_attr=weight_attr, normalize_before=False, num_partitions=num_partitions) self.encoder = TransformerEncoder( encoder_layer, num_hidden_layers, enable_recompute=use_recompute) self.pooler = ErniePooler(hidden_size, weight_attr) self.apply(self.init_weights) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, task_type_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. token_type_ids (Tensor, optional): Segment token indices to indicate different portions of the inputs. Selected in the range ``[0, type_vocab_size - 1]``. If `type_vocab_size` is 2, which means the inputs have two portions. Indices can either be 0 or 1: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. Defaults to `None`, which means we don't add segment embeddings. position_ids (Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, max_position_embeddings - 1]``. Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. attention_mask (Tensor, optional): Mask used in multi-head attention to avoid performing attention on to some unwanted positions, usually the paddings or the subsequent positions. Its data type can be int, float and bool. When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. When the data type is int, the `masked` tokens have `0` values and the others have `1` values. When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. 
For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], [batch_size, num_attention_heads, sequence_length, sequence_length]. We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, "使" and "用" will have the same value. Defaults to `None`, which means nothing needed to be prevented attention to. inputs_embeds (Tensor, optional): If you want to control how to convert `inputs_ids` indices into associated vectors, you can pass an embedded representation directly instead of passing `inputs_ids`. past_key_values (tuple(tuple(Tensor)), optional): The length of tuple equals to the number of layers, and each inner tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) which contains precomputed key and value hidden states of the attention blocks. If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are returned. Defaults to `None`. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. """ if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time." 
) elif input_ids is not None: input_shape = paddle.shape(input_ids) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") past_key_values_length = None if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length) self.encoder._use_cache = use_cache # To be consistent with HF encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, cache=past_key_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = encoder_outputs pooled_output = self.pooler(sequence_output) return (sequence_output, pooled_output) else: sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErnieLMPredictionHead(nn.Layer): r""" Ernie Model with a `language modeling` head on top. 
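# A minimal sketch, with made-up sizes, of the vocab projection performed in
# ErnieLMPredictionHead.forward below: hidden states are multiplied against the
# transposed decoder weight, which is also what parallel_matmul near the top of
# this file reduces to when the model-parallel world size is 1.
import paddle

hidden_size, vocab_size = 8, 16
hidden_states = paddle.randn([2, 4, hidden_size])
decoder_weight = paddle.randn([vocab_size, hidden_size])
decoder_bias = paddle.zeros([vocab_size])
logits = paddle.matmul(hidden_states, decoder_weight, transpose_y=True) + decoder_bias
print(logits.shape)                              # [2, 4, 16]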
""" def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErnieLMPredictionHead, self).__init__() self.transform = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = getattr(nn.functional, activation) self.layer_norm = nn.LayerNorm(hidden_size) # TODO(shenliang03): to support shared weights in future self.decoder_weight = self.create_parameter( shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, attr=weight_attr, is_bias=False) # if embedding_weights is None else embedding_weights self.decoder_bias = self.create_parameter( shape=[self.decoder_weight.shape[0]], dtype=self.decoder_weight.dtype, is_bias=True) def forward(self, hidden_states, masked_positions=None): if masked_positions is not None: hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) hidden_states = paddle.tensor.gather(hidden_states, masked_positions) # gather masked tokens might be more quick hidden_states = self.transform(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.layer_norm(hidden_states) # hidden_states = parallel_matmul(hidden_states, self.decoder_weight, True) + self.decoder_bias hidden_states = paddle.matmul( hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias return hidden_states class ErniePretrainingHeads(nn.Layer): def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErniePretrainingHeads, self).__init__() self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights, weight_attr) self.seq_relationship = nn.Linear( hidden_size, 2, weight_attr=weight_attr) def forward(self, sequence_output, pooled_output, masked_positions=None): prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class ErnieForPretrainingHybrid(nn.Layer): r""" Ernie Model with a `masked language modeling` head and a `sentence order prediction` head on top. """ def __init__(self, ernie): super(ErnieForPretrainingHybrid, self).__init__() self.ernie = ernie weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.ernie.initializer_range)) self.cls = ErniePretrainingHeads( self.ernie.hidden_size, self.ernie.vocab_size, self.ernie.hidden_act, embedding_weights=self.ernie.embeddings.word_embeddings.weight, weight_attr=weight_attr, ) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, masked_positions=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids (Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. inputs_embeds(Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size, sequence_length)`, optional): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. 
next_sentence_label (Tensor of shape `(batch_size,)`, optional): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`. """ # with paddle.static.amp.fp16_guard(): outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output, masked_positions) total_loss = None if labels is not None and next_sentence_label is not None: if env.get_hcg().get_model_parallel_world_size > 1 and paddle.is_compiled_with_cuda(): loss_fct = fleet.meta_parallel.ParallelCrossEntropy() else: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.reshape( (-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1, ))) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1, ))) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ( (total_loss, ) + output) if total_loss is not None else output return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErniePretrainingCriterionHybrid(paddle.nn.Layer): r""" The loss output of Ernie Model during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """ def __init__(self, with_nsp_loss=True): super(ErniePretrainingCriterionHybrid, self).__init__() self.with_nsp_loss = with_nsp_loss def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): """ Args: prediction_scores(Tensor): The scores of masked token prediction. Its data type should be float32. 
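# A minimal sketch, with toy values, of the masked-LM part of
# ErniePretrainingCriterionHybrid.forward below: label positions set to -1 are
# ignored by F.cross_entropy (they contribute zero loss), and the unreduced
# losses are then averaged, mirroring the reduction='none' + paddle.mean pattern.
import paddle
import paddle.nn.functional as F

prediction_scores = paddle.randn([6, 8])              # [mask_token_num, vocab_size]
masked_lm_labels = paddle.to_tensor([3, 1, -1, 5, -1, 2])
masked_lm_loss = F.cross_entropy(
    prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none')
print(paddle.mean(masked_lm_loss))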
If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. Otherwise, its shape is [batch_size, mask_token_num, vocab_size] seq_relationship_score(Tensor): The scores of next sentence prediction. Its data type should be float32 and its shape is [batch_size, 2] masked_lm_labels(Tensor): The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. Otherwise, its shape is [batch_size, mask_token_num, 1] next_sentence_labels(Tensor): The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1] Returns: Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. Its data type should be float32 and its shape is [1]. """ # with paddle.static.amp.fp16_guard(): # hcg = env.get_hcg() # mp_size = hcg.get_model_parallel_world_size() # if mp_size > 1: # mask = (masked_lm_labels == -1) # masked_lm_labels[mask] = 0 # masked_lm_loss = self.parallel_loss_func( # prediction_scores, masked_lm_labels) # masked_lm_loss[mask] = 0. # else: # masked_lm_loss = self.loss_func(prediction_scores, # masked_lm_labels, # ignore_index=-1) masked_lm_loss = F.cross_entropy( prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none') if not self.with_nsp_loss: return paddle.mean(masked_lm_loss) next_sentence_loss = F.cross_entropy( seq_relationship_score, next_sentence_labels, reduction='none') return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss) # these Layers is just for PipelineParallel class EmbeddingsPipe(ErnieEmbeddings): @property def embedding_weight(self): return self.word_embeddings.weight def forward(self, tensors): input_ids, token_type_ids, attention_mask = tensors past_key_values_length = None if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embeddings = super().forward( input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, task_type_ids=None, inputs_embeds=None, past_key_values_length=past_key_values_length) return attention_mask, embeddings class TransformerEncoderLayerPipe(TransformerEncoderLayer): def forward(self, tensors): attention_mask, inputs = tensors outputs = super().forward(src=inputs, src_mask=attention_mask) return attention_mask, outputs class LayerNormPipe(nn.LayerNorm): def forward(self, tensors): _, inputs = tensors output = super().forward(inputs) return output class ErniePoolerPipe(ErniePooler): def forward(self, args): sequence_output = args pooled_output = super().forward(sequence_output) return sequence_output, pooled_output class ErniePretrainingCriterionPipe(ErniePretrainingCriterionHybrid): def __init__(self, *heads_args, **heads_kargs): super(ErniePretrainingCriterionPipe, self).__init__() self.heads = 
ErniePretrainingHeads(*heads_args, **heads_kargs) def forward(self, outputs, data): sequence_output, pooled_output = outputs masked_lm_positions, masked_lm_labels, next_sentence_labels = data prediction_scores, seq_relationship_score = self.heads( sequence_output, pooled_output, masked_lm_positions) lm_loss, sop_loss = super().forward( prediction_scores=prediction_scores, seq_relationship_score=seq_relationship_score, masked_lm_labels=masked_lm_labels, next_sentence_labels=next_sentence_labels) return lm_loss + sop_loss class ErnieForPretrainingPipe(PipelineLayer): def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False, num_partitions=1): self.descs = [] self.descs.append( LayerDesc( EmbeddingsPipe, vocab_size=vocab_size, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, max_position_embeddings=max_position_embeddings, type_vocab_size=type_vocab_size, pad_token_id=pad_token_id, weight_attr=None, task_type_vocab_size=task_type_vocab_size, task_id=task_id, use_task_id=use_task_id)) for _ in range(num_hidden_layers): self.descs.append( LayerDesc( TransformerEncoderLayerPipe, d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, normalize_before=False, weight_attr=None, bias_attr=None, num_partitions=num_partitions)) self.descs.append( LayerDesc( LayerNormPipe, normalized_shape=hidden_size)) self.descs.append(LayerDesc(ErniePoolerPipe, hidden_size=hidden_size)) loss_fun = ErniePretrainingCriterionPipe( hidden_size=hidden_size, vocab_size=vocab_size, activation=hidden_act, embedding_weights=None, weight_attr=paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=initializer_range))) super().__init__( layers=self.descs, loss_fn=loss_fun, topology=env.get_hcg().topology(), seg_method="layer:TransformerEncoderLayer", recompute_interval=1 if use_recompute else 0, recompute_ctx={ "mp_group": env.get_hcg().get_model_parallel_group(), "offload": False, "partition": False }) class ErnieForSequenceClassificationHybrid(nn.Layer): """ Ernie Model with a linear layer on top of the output layer, designed for sequence classification/regression tasks like GLUE tasks. Args: ernie (:class:`ErnieModel`): An instance of ErnieModel. num_classes (int, optional): The number of classes. Defaults to `2`. dropout (float, optional): The dropout probability for output of ERNIE. If None, use the same value as `hidden_dropout_prob` of `ErnieModel` instance `ernie`. Defaults to None. """ def __init__(self, ernie, num_classes=2, dropout=None): super(ErnieForSequenceClassificationHybrid, self).__init__() self.num_classes = num_classes self.ernie = ernie # allow ernie to be config self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.hidden_dropout_prob) self.classifier = nn.Linear(self.ernie.hidden_size, num_classes) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" The ErnieForSequenceClassification forward method, overrides the __call__() special method. 
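# A minimal sketch, with assumed toy shapes, of the label-dependent loss
# selection used by the forward method below: MSE for a single regression
# target, cross-entropy for integer class labels, and BCEWithLogits otherwise.
import paddle

num_classes = 3
logits = paddle.randn([4, num_classes])
labels = paddle.randint(0, num_classes, [4])           # int64 -> cross-entropy branch

if num_classes == 1:
    loss = paddle.nn.MSELoss()(logits, labels)
elif labels.dtype in (paddle.int64, paddle.int32):
    loss = paddle.nn.CrossEntropyLoss()(
        logits.reshape((-1, num_classes)), labels.reshape((-1, )))
else:
    loss = paddle.nn.BCEWithLogitsLoss()(logits, labels)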
Args: input_ids (Tensor): See :class:`ErnieModelHybrid`. token_type_ids (Tensor, optional): See :class:`ErnieModelHybrid`. position_ids(Tensor, optional): See :class:`ErnieModelHybrid`. attention_mask (Tensor, optional): See :class:`ErnieModelHybrid`. labels (Tensor of shape `(batch_size,)`, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` a regression loss is computed (Mean-Square loss), If `num_classes > 1` a classification loss is computed (Cross-Entropy). output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`. """ outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_classes == 1: loss_fct = paddle.nn.MSELoss() loss = loss_fct(logits, labels) elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: loss_fct = paddle.nn.CrossEntropyLoss() loss = loss_fct( logits.reshape((-1, self.num_classes)), labels.reshape((-1, ))) else: loss_fct = paddle.nn.BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else ( output[0] if len(output) == 1 else output) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 ================================================ FILE: ppfleetx/models/language_model/ernie/dygraph/single_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import os import io import copy import logging import json import paddle import paddle.nn as nn from paddle.nn import functional as F from dataclasses import dataclass, field from ..layers.model_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ErnieForPreTrainingOutput, SequenceClassifierOutput, ) from ..layers.transformer import TransformerEncoderLayer, TransformerEncoder class ErnieEmbeddings(nn.Layer): r""" Include embeddings from word, position and token_type embeddings. """ def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, weight_attr=None, task_type_vocab_size=3, task_id=0, use_task_id=False): super(ErnieEmbeddings, self).__init__() self.word_embeddings = nn.Embedding( vocab_size, hidden_size, padding_idx=pad_token_id, weight_attr=weight_attr) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=weight_attr) self.type_vocab_size = type_vocab_size if self.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding( type_vocab_size, hidden_size, weight_attr=weight_attr) self.use_task_id = use_task_id self.task_id = task_id if self.use_task_id: self.task_type_embeddings = nn.Embedding( task_type_vocab_size, hidden_size, weight_attr=weight_attr) self.layer_norm = nn.LayerNorm(hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None, task_type_ids=None, inputs_embeds=None, past_key_values_length=None): if input_ids is not None: input_shape = paddle.shape(input_ids) input_embeddings = self.word_embeddings(input_ids) else: input_shape = paddle.shape(inputs_embeds)[:-1] input_embeddings = inputs_embeds if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph #seq_length = input_ids.shape[1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) position_ids = seq_length - ones if past_key_values_length is not None: position_ids += past_key_values_length position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) embeddings = input_embeddings + position_embeddings if self.type_vocab_size > 0: if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = embeddings + token_type_embeddings if self.use_task_id: if task_type_ids is None: task_type_ids = paddle.ones( input_shape, dtype="int64") * self.task_id task_type_embeddings = self.task_type_embeddings(task_type_ids) embeddings = embeddings + task_type_embeddings embeddings = self.layer_norm(embeddings) embeddings = self.dropout(embeddings) return embeddings class ErniePooler(nn.Layer): def __init__(self, hidden_size, weight_attr=None): super(ErniePooler, self).__init__() self.dense = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class ErnieModel(nn.Layer): r""" The bare ERNIE Model transformer outputting raw hidden-states. This model is a Paddle `paddle.nn.Layer `__ subclass. 
Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: vocab_size (int): Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. hidden_size (int, optional): Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. num_hidden_layers (int, optional): Number of hidden layers in the Transformer encoder. Defaults to `12`. num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. Defaults to `12`. intermediate_size (int, optional): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. Defaults to `3072`. hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. Defaults to `"gelu"`. hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. Defaults to `0.1`. attention_probs_dropout_prob (float, optional): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. Defaults to `0.1`. max_position_embeddings (int, optional): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. Defaults to `512`. type_vocab_size (int, optional): The vocabulary size of the `token_type_ids`. Defaults to `2`. initializer_range (float, optional): The standard deviation of the normal initializer for initializing all weight matrices. Defaults to `0.02`. .. note:: A normal_initializer initializes weight matrices as normal distributions. See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. pad_token_id(int, optional): The index of padding token in the token vocabulary. Defaults to `0`. 
""" def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False): super(ErnieModel, self).__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.initializer_range)) self.embeddings = ErnieEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, pad_token_id, weight_attr, task_type_vocab_size, task_id, use_task_id) encoder_layer = TransformerEncoderLayer( hidden_size, num_attention_heads, intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=0, weight_attr=weight_attr, normalize_before=False) self.encoder = TransformerEncoder( encoder_layer, num_hidden_layers, enable_recompute=use_recompute) self.pooler = ErniePooler(hidden_size, weight_attr) self.apply(self.init_weights) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, task_type_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. token_type_ids (Tensor, optional): Segment token indices to indicate different portions of the inputs. Selected in the range ``[0, type_vocab_size - 1]``. If `type_vocab_size` is 2, which means the inputs have two portions. Indices can either be 0 or 1: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. Defaults to `None`, which means we don't add segment embeddings. position_ids (Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, max_position_embeddings - 1]``. Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. attention_mask (Tensor, optional): Mask used in multi-head attention to avoid performing attention on to some unwanted positions, usually the paddings or the subsequent positions. Its data type can be int, float and bool. When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. When the data type is int, the `masked` tokens have `0` values and the others have `1` values. When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], [batch_size, num_attention_heads, sequence_length, sequence_length]. 
We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, "使" and "用" will have the same value. Defaults to `None`, which means nothing needed to be prevented attention to. inputs_embeds (Tensor, optional): If you want to control how to convert `inputs_ids` indices into associated vectors, you can pass an embedded representation directly instead of passing `inputs_ids`. past_key_values (tuple(tuple(Tensor)), optional): The length of tuple equals to the number of layers, and each inner tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) which contains precomputed key and value hidden states of the attention blocks. If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are returned. Defaults to `None`. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. """ if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time." 
) elif input_ids is not None: input_shape = paddle.shape(input_ids) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") past_key_values_length = None if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length) self.encoder._use_cache = use_cache # To be consistent with HF encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, cache=past_key_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = encoder_outputs pooled_output = self.pooler(sequence_output) return (sequence_output, pooled_output) else: sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErnieLMPredictionHead(nn.Layer): r""" Ernie Model with a `language modeling` head on top. 
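# A minimal sketch, with made-up indices, of the masked-position gather used by
# ErnieLMPredictionHead.forward below: hidden states are flattened to
# [batch * seq, hidden] and the rows at the flattened masked positions are
# selected before the vocab projection.
import paddle

batch_size, seq_len, hidden_size = 2, 4, 8
hidden_states = paddle.randn([batch_size, seq_len, hidden_size])
masked_positions = paddle.to_tensor([1, 3, 6])        # indices into batch * seq
flat = paddle.reshape(hidden_states, [-1, hidden_size])
masked_states = paddle.gather(flat, masked_positions)
print(masked_states.shape)                            # [3, 8]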
""" def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErnieLMPredictionHead, self).__init__() self.transform = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = getattr(nn.functional, activation) self.layer_norm = nn.LayerNorm(hidden_size) self.decoder_weight = self.create_parameter( shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, attr=weight_attr, is_bias=False) if embedding_weights is None else embedding_weights self.decoder_bias = self.create_parameter( shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) def forward(self, hidden_states, masked_positions=None): if masked_positions is not None: hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) hidden_states = paddle.tensor.gather(hidden_states, masked_positions) # gather masked tokens might be more quick hidden_states = self.transform(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.layer_norm(hidden_states) hidden_states = paddle.matmul( hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias return hidden_states class ErniePretrainingHeads(nn.Layer): def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErniePretrainingHeads, self).__init__() self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights, weight_attr) self.seq_relationship = nn.Linear( hidden_size, 2, weight_attr=weight_attr) def forward(self, sequence_output, pooled_output, masked_positions=None): prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class ErnieForPretraining(nn.Layer): r""" Ernie Model with a `masked language modeling` head and a `sentence order prediction` head on top. """ def __init__(self, ernie): super(ErnieForPretraining, self).__init__() self.ernie = ernie weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.ernie.initializer_range)) self.cls = ErniePretrainingHeads( self.ernie.hidden_size, self.ernie.vocab_size, self.ernie.hidden_act, embedding_weights=self.ernie.embeddings.word_embeddings.weight, weight_attr=weight_attr, ) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, masked_positions=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids (Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. inputs_embeds(Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size, sequence_length)`, optional): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. next_sentence_label (Tensor of shape `(batch_size,)`, optional): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`. """ # with paddle.static.amp.fp16_guard(): outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output, masked_positions) total_loss = None if labels is not None and next_sentence_label is not None: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.reshape( (-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1, ))) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1, ))) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ( (total_loss, ) + output) if total_loss is not None else output return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErniePretrainingCriterion(paddle.nn.Layer): r""" The loss output of Ernie Model during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """ def __init__(self, with_nsp_loss=True): super(ErniePretrainingCriterion, self).__init__() self.with_nsp_loss = with_nsp_loss #self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1) def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): """ Args: prediction_scores(Tensor): The scores of masked token prediction. Its data type should be float32. If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. Otherwise, its shape is [batch_size, mask_token_num, vocab_size] seq_relationship_score(Tensor): The scores of next sentence prediction. 
Its data type should be float32 and its shape is [batch_size, 2] masked_lm_labels(Tensor): The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. Otherwise, its shape is [batch_size, mask_token_num, 1] next_sentence_labels(Tensor): The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1] Returns: Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. Its data type should be float32 and its shape is [1]. """ with paddle.static.amp.fp16_guard(): masked_lm_loss = F.cross_entropy( prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none') if not self.with_nsp_loss: return paddle.mean(masked_lm_loss) next_sentence_loss = F.cross_entropy( seq_relationship_score, next_sentence_labels, reduction='none') return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss) class ErnieForSequenceClassification(nn.Layer): """ Ernie Model with a linear layer on top of the output layer, designed for sequence classification/regression tasks like GLUE tasks. Args: ernie (:class:`ErnieModel`): An instance of ErnieModel. num_classes (int, optional): The number of classes. Defaults to `2`. dropout (float, optional): The dropout probability for output of ERNIE. If None, use the same value as `hidden_dropout_prob` of `ErnieModel` instance `ernie`. Defaults to None. """ def __init__(self, ernie, num_classes=2, dropout=None): super(ErnieForSequenceClassification, self).__init__() self.num_classes = num_classes self.ernie = ernie # allow ernie to be config self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.hidden_dropout_prob) self.classifier = nn.Linear(self.ernie.hidden_size, num_classes) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" The ErnieForSequenceClassification forward method, overrides the __call__() special method. Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids(Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size,)`, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` a regression loss is computed (Mean-Square loss), If `num_classes > 1` a classification loss is computed (Cross-Entropy). output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`. """ outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_classes == 1: loss_fct = paddle.nn.MSELoss() loss = loss_fct(logits, labels) elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: loss_fct = paddle.nn.CrossEntropyLoss() loss = loss_fct( logits.reshape((-1, self.num_classes)), labels.reshape((-1, ))) else: loss_fct = paddle.nn.BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else ( output[0] if len(output) == 1 else output) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 ================================================ FILE: ppfleetx/models/language_model/ernie/ernie_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
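# A quick numeric check, with assumed config values, of the parameter-count
# formula used by ErnieModule.get_model_size further down in this file:
#     P = 12 * l * h^2 * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))
l, h, v, s = 12, 768, 40000, 512      # layers, hidden size, vocab size, seq len (assumed)
P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))
print('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0))   # ~0.12 B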
import sys import copy import yaml import codecs from collections.abc import Mapping import paddle from paddle.static import InputSpec import paddle.nn as nn from ppfleetx.core.module.basic_module import BasicModule import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.log import logger from .dygraph.single_model import ( ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieForSequenceClassification, ) from .dygraph.hybrid_model import (ErnieModelHybrid, ErnieForPretrainingHybrid, ErniePretrainingCriterionHybrid, ErnieForPretrainingPipe, ErnieForSequenceClassificationHybrid) from ppfleetx.models.language_model.utils import process_configs import numpy as np def process_data_configs(config): """ process data configs for hybrid parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] cfg_data[mode]['sampler']['batch_size'] = cfg_global[ 'local_batch_size'] cfg_data[mode]['dataset'].setdefault('binary_head', cfg_global['binary_head']) cfg_data[mode]['loader']['collate_fn'].setdefault( 'micro_batch_size', cfg_global['micro_batch_size']) def process_model_configs(config): cfg_model = config['Model'] hidden_size = cfg_model['hidden_size'] cfg_model.setdefault("intermediate_size", hidden_size * 4) def process_finetune_configs(task, config): cfg_data = config['Data'] cfg_dist = config['Distributed'] cfg_optim = config['Optimizer'] cfg_global = config['Global'] cfg_engine = config['Engine'] path = "./ppfleetx/models/language_model/ernie/finetune_configs.yaml" with codecs.open(path, 'r', 'utf-8') as file: dic = yaml.load(file, Loader=yaml.FullLoader) dataset_type = cfg_data.Train.dataset.dataset_type assert dataset_type in dic[task].keys( ), "{} is an invalid dataset type ! 
Only support the types of dataset shown in {}".format( dataset_type, path) num_train_epochs = dic[task][dataset_type].get('num_train_epochs', None) if num_train_epochs is not None: cfg_engine['num_train_epochs'] = num_train_epochs learning_rate = dic[task][dataset_type].get("learning_rate", None) if learning_rate is not None: cfg_optim['lr']['max_lr'] = learning_rate max_seq_length = dic[task][dataset_type].get("max_seq_length", None) if max_seq_length is not None: for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['max_seq_len'] = max_seq_length batch_size = dic[task][dataset_type].get("batch_size", None) if batch_size is not None: assert batch_size % cfg_global['micro_batch_size'] == 0 cfg_global['local_batch_size'] = batch_size cfg_global['global_batch_size'] = batch_size * cfg_dist[ 'dp_degree'] * cfg_dist['pp_degree'] class ErnieModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieModule, self).__init__(configs) self.nranks = paddle.distributed.get_world_size() self.binary_head = self.configs['Global']['binary_head'] if self.nranks > 1: self.criterion = ErniePretrainingCriterionHybrid(self.binary_head) else: self.criterion = ErniePretrainingCriterion(self.binary_head) def get_model_size(self, l, h, v, s): P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h)) logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def process_configs(self, configs): process_data_configs(configs) process_model_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") l = model_setting['num_hidden_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_length self.get_model_size(l, h, v, s) if self.nranks > 1: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree # model = ErnieForPretrainingHybrid(ErnieModelHybrid(**model_setting)) if self.configs.Distributed.pp_degree == 1: model = ErnieForPretrainingHybrid( ErnieModelHybrid(**model_setting)) else: model = ErnieForPretrainingPipe(**model_setting) else: model = ErnieForPretraining(ErnieModel(**model_setting)) return model def forward(self, tokens): return self.model(tokens) def pretreating_batch(self, batch): if self.configs.Distributed.pp_degree > 1: input_ids, segment_ids, input_mask, masked_lm_positions, \ masked_lm_labels, next_sentence_labels = batch if not isinstance(masked_lm_positions, list): masked_lm_positions = [masked_lm_positions] if not isinstance(masked_lm_labels, list): masked_lm_labels = [masked_lm_labels] data = [ (input_ids, segment_ids, input_mask), (masked_lm_positions, masked_lm_labels, next_sentence_labels) ] return data else: return batch def training_step(self, batch): input_ids, segment_ids, input_mask, masked_lm_positions, \ masked_lm_labels, next_sentence_labels = batch # Create the model for the ernie pretrain if self.binary_head: prediction_scores, seq_relationship_score = self.model( input_ids=input_ids, token_type_ids=segment_ids, # position_ids=None, attention_mask=input_mask, masked_positions=masked_lm_positions) lm_loss, sop_loss = self.criterion( prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) loss = lm_loss + sop_loss else: prediction_scores = self.model( input_ids=input_ids, token_type_ids=segment_ids, # position_ids=None, attention_mask=input_mask, 
masked_positions=masked_lm_positions) loss = self.criterion(prediction_scores, None, masked_lm_labels) return loss def training_step_end(self, log_dict): speed = 1. / log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_length logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.nranks, log_dict['lr'])) def input_spec(self): return [ InputSpec( shape=[None, None], dtype='int64'), InputSpec( shape=[None, None], dtype='int64'), InputSpec( shape=[None, None], dtype='int64') ] class ErnieSeqClsModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieSeqClsModule, self).__init__(configs) self.criterion = nn.loss.CrossEntropyLoss( ) # if data_args.label_list else nn.loss.MSELoss() self.past_index = -1 self.past = None self.label_names = (["start_positions", "end_positions"] \ if "QusetionAnswering" in type(self.model).__name__ else ["labels"]) def process_configs(self, configs): process_model_configs(configs) process_finetune_configs("SequenceClassification", configs) cfg_global = configs['Global'] cfg_data = configs['Data'] for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['sampler']['batch_size'] = cfg_global[ 'local_batch_size'] cfg_data[mode]['loader']['collate_fn'].setdefault( 'tokenizer_type', cfg_data[mode]['dataset']['tokenizer_type']) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") if self.nranks > 1: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree if self.configs.Distributed.pp_degree == 1: model = ErnieForSequenceClassificationHybrid( ErnieModelHybrid(**model_setting)) else: raise ValueError( "Pipeline Parallelism is not supported in Sequence \ Classification task of Ernie model.") else: model = ErnieForSequenceClassification(ErnieModel(**model_setting)) return model def prepare_input(self, data): """ Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. 
""" if isinstance(data, Mapping): return type(data)( {k: self.prepare_input(v) for k, v in data.items()}) elif isinstance(data, (tuple, list)): return type(data)(self.prepare_input(v) for v in data) elif isinstance(data, paddle.Tensor): # kwargs = dict(device=self.args.current_device) # update data type for pure fp16 return data # return data.to(**kwargs) return data def pretreating_batch(self, batch): self.has_labels = all( batch.get(k) is not None for k in self.label_names) batch = self.prepare_input(batch) if self.past_index >= 0 and self.past is not None: batch["mems"] = self.past return batch def forward(self, inputs): return self.model(**inputs) def compute_loss(self, inputs, return_outputs=False): if "labels" in inputs: labels = inputs.pop("labels") elif "start_positions" in inputs and "end_positions" in inputs: labels = (inputs.pop("start_positions"), inputs.pop("end_positions")) elif "generator_labels" in inputs: labels = inputs["generator_labels"] else: labels = None outputs = self(inputs) loss = self.criterion(outputs, labels) outputs = (loss, outputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.past_index >= 0: self.past = outputs[self.args.past_index] # We don't use .loss here since the model may return tuples instead of ModelOutput. loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] return (loss, outputs) if return_outputs else loss def training_step(self, batch): return self.compute_loss(batch) def training_step_end(self, log_dict): speed = 1. / log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.nranks, log_dict['lr'])) def input_spec(self): input_spec = [ paddle.static.InputSpec( shape=[None, None], dtype="int64"), # input_ids paddle.static.InputSpec( shape=[None, None], dtype="int64") # segment_ids ] return input_spec def validation_step(self, inputs): if self.has_labels: loss, outputs = self.compute_loss(inputs, return_outputs=True) loss = loss.mean().detach() else: loss = None return loss def validation_step_end(self, log_dict): speed = 1. / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost'], speed)) ================================================ FILE: ppfleetx/models/language_model/ernie/finetune_configs.yaml ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# Datasets which are used for sequence classification
SequenceClassification:
  clue afqmc:
    num_train_epochs: 4
  clue tnews:
    num_train_epochs: 4
  clue iflytek:
    num_train_epochs: 8
  clue ocnli:
    num_train_epochs: 8
  clue cmnli:
    num_train_epochs: 3
  clue wsc:
    num_train_epochs: 50
  clue csl:
    num_train_epochs: 10
    max_seq_length: 256
    batch_size: 32
  xnli_cn:
    learning_rate: 0.0001
    num_train_epochs: 3
    batch_size: 256
  chnsenticorp_v2:
    learning_rate: 0.00005
    batch_size: 16
    num_train_epochs: 8


================================================
FILE: ppfleetx/models/language_model/ernie/layers/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: ppfleetx/models/language_model/ernie/layers/distributed_transformer.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TODO: define the classes of Transformer neural network
import copy
import collections

import numpy as np

import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer
import paddle.tensor as tensor
from paddle.fluid import layers
from paddle import ParamAttr
from paddle.fluid.data_feeder import convert_dtype
from .model_outputs import BaseModelOutputWithPastAndCrossAttentions

from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc
from paddle.distributed.fleet.utils import recompute

__all__ = []


def _convert_param_attr_to_list(param_attr, n):
    """
    If `param_attr` is a list or tuple, convert every element in it to a
    ParamAttr instance. Otherwise, repeat `param_attr` `n` times to construct
    a list, and rename every one by appending an increasing index suffix to
    avoid having the same names when `param_attr` contains a name.

    Parameters:
        param_attr (list|tuple|ParamAttr): A list, a tuple, or something that
            can be converted to a ParamAttr instance by `ParamAttr._to_attr`.
        n (int): The number of times to repeat to construct a list when
            `param_attr` is not a list or tuple.

    Returns:
        list: A list composed of each cell's `param_attr`.
""" if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) param_attrs = [] for attr in param_attr: if isinstance(attr, bool): if attr: param_attrs.append(ParamAttr._to_attr(None)) else: param_attrs.append(False) else: param_attrs.append(ParamAttr._to_attr(attr)) # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] elif isinstance(param_attr, bool): param_attrs = [] if param_attr: param_attrs = [ParamAttr._to_attr(None) for i in range(n)] else: param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) for i in range(n): attr_i = copy.deepcopy(attr) if attr.name: attr_i.name = attr_i.name + "_" + str(i) param_attrs.append(attr_i) return param_attrs def _convert_attention_mask(attn_mask, dtype): """ Convert the attention mask to the target dtype we expect. Parameters: attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. Returns: Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. """ if attn_mask is not None and attn_mask.dtype != dtype: attn_mask_dtype = convert_dtype(attn_mask.dtype) if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 else: attn_mask = paddle.cast(attn_mask, dtype) return attn_mask class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. Please refer to `Attention Is All You Need `_ for more details. Parameters: embed_dim (int): The expected feature size in the input and output. num_heads (int): The number of heads in multi-head attention. dropout (float, optional): The dropout probability used on attention weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Examples: .. 
code-block:: python import paddle # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, num_partitions=1): super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.num_heads % num_partitions == 0 self.num_heads = self.num_heads // num_partitions # self.q_proj = Linear( # embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) # self.k_proj = Linear( # self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) # self.v_proj = Linear( # self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) # self.out_proj = Linear( # embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.q_proj = fleet.meta_parallel.ColumnParallelLinear( embed_dim, embed_dim, weight_attr=weight_attr, has_bias=True, gather_output=False) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( self.kdim, embed_dim, weight_attr=weight_attr, has_bias=True, gather_output=False) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( self.vdim, embed_dim, weight_attr=weight_attr, has_bias=True, gather_output=False) self.out_proj = fleet.meta_parallel.RowParallelLinear( embed_dim, embed_dim, weight_attr=weight_attr, has_bias=True, input_is_parallel=True) def _prepare_qkv(self, query, key, value, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ and `[batch_size, n_head, sequence_length, d_value]` separately, \ and their data types are same as inputs. """ q = self.q_proj(query.clone()) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key.clone(), value.clone()) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. """ k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used for decoder-encoder cross attention. The cache is generated as follows: 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. 2. 
If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first dimension of `key`. 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def forward(self, query, key=None, value=None, attn_mask=None, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ If `cache` is not None, the tuple then includes the new cache \ having the same type as `cache`, and if it is `StaticCache`, it \ is same as the input `cache`, if it is `Cache`, the new cache \ reserves tensors concatanating raw tensors with intermediate \ results of current query. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if cache is None: q, k, v = self._prepare_qkv(query, key, value, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = paddle.matmul( x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if cache is not None: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If `normalize_before` is True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Parameters: d_model (int): The expected feature size in the input and output. nhead (int): The number of heads in multi-head attention(MHA). dim_feedforward (int): The hidden layer size in the feedforward network(FFN). dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward network. Default relu. attn_dropout (float, optional): The dropout probability used in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. 
Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. The `False` value means the corresponding layer would not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False, weight_attr=None, bias_attr=None, num_partitions=1): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], num_partitions=num_partitions) # self.linear1 = Linear( # d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout(act_dropout, mode="upscale_in_train") # self.linear2 = Linear( # dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout, mode="upscale_in_train") self.dropout2 = Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self.linear1 = fleet.meta_parallel.ColumnParallelLinear( d_model, dim_feedforward, weight_attr=weight_attrs[1], gather_output=False, has_bias=True) self.linear2 = fleet.meta_parallel.RowParallelLinear( dim_feedforward, d_model, weight_attr=weight_attrs[1], input_is_parallel=True, has_bias=True) def forward(self, src, src_mask=None, cache=None, output_attentions=False): r""" Applies a Transformer encoder layer on the input. Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. 
src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ self.self_attn.need_weights = output_attentions src_mask = _convert_attention_mask(src_mask, src.dtype) residual = src if self.normalize_before: src = self.norm1(src) attn_outputs = self.self_attn(src, src, src, src_mask, cache) if isinstance(attn_outputs, tuple): src = attn_outputs[0] outputs = attn_outputs[1:] else: src = attn_outputs outputs = None src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) with get_rng_state_tracker().rng_state('global_seed'): tgt = self.dropout(self.activation(self.linear1(src))) # tgt = residual + self.dropout1(tgt) src = self.linear2(tgt) with get_rng_state_tracker().rng_state('global_seed'): src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src if outputs is None else ( (src, ) + outputs[::-1]) # hidden_states, cache, attentions def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( src, type=self.self_attn.Cache) return incremental_cache class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It would be used as the first layer, and the other layers would be created according to the configurations of it. num_layers (int): The number of encoder layers to be stacked. norm (LayerNorm, optional): the layer normalization component. If provided, apply layer normalization on the output of last encoder layer. Examples: .. 
code-block:: python import paddle from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None, enable_recompute=False): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm self.enable_recompute = enable_recompute def forward(self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): r""" Applies a stack of N Transformer encoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last encoder layer. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `src`, representing the output of Transformer encoder. \ Or a tuple if `cache` is not None, except for encoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ src_mask = _convert_attention_mask(src_mask, src.dtype) output = src # To get cache from None when use_cache is True, which is compatible with HF # while HF requires decoder. The implementation here uses cache update in the # MultiHeadAttention not so efficiently, and maybe optimize it later. if cache is None and getattr(self, "_use_cache", False): cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts # to True when cache is not None. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None all_attentions = [] if output_attentions else None # NOTE: Also includes embeding output which is same as HF. all_hidden_states = [output] if output_hidden_states else None for i, mod in enumerate(self.layers): if self.enable_recompute: # Note: recompute do not support pass as **kwargs yet. 
layer_outputs = recompute( mod, output, src_mask, None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions) else: layer_outputs = mod( output, src_mask=src_mask, cache=None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions=output_attentions) if isinstance(layer_outputs, tuple): output = layer_outputs[0] outputs = layer_outputs[1:] else: output = layer_outputs outputs = None if output_hidden_states: all_hidden_states.append(output) if output_attentions: all_attentions.append(outputs[-1]) if new_caches is not None: new_caches.append(outputs[0] if isinstance(cache[ i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) if self.norm is not None: output = self.norm(output) if output_hidden_states: all_hidden_states[-1] = output if not return_dict: outputs = tuple( tuple(v) if isinstance(v, list) else v for v in [ output, new_caches, all_hidden_states, all_attentions, ] if v is not None) if len(outputs) == 1: return output else: return outputs return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=output, past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_attentions) def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: list: It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] return cache ================================================ FILE: ppfleetx/models/language_model/ernie/layers/model_outputs.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import functools import paddle import numpy as np from collections import OrderedDict from dataclasses import fields, dataclass from typing import Any, List, Tuple, Optional from paddle.nn.layer.transformer import _convert_attention_mask, MultiHeadAttention from paddle.distributed.fleet.utils import recompute from .utils import adapt_stale_fwd_patch def is_tensor(x): if isinstance(x, paddle.Tensor): return True return isinstance(x, np.ndarray) class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular python dictionary. 
You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple before. """ def __post_init__(self): class_fields = fields(self) # note(guosheng): Convert list to tuple automatically, and better to # check if it is frozen. # assert not getattr(self, dataclasses._PARAMS).frozen for f in class_fields: value = getattr(self, f.name) if isinstance(value, list): setattr(self, f.name, tuple(value)) # Safety and consistency checks if not len(class_fields): raise ValueError(f"{self.__class__.__name__} has no fields.") if not all(field.default is None for field in class_fields[1:]): raise ValueError( f"{self.__class__.__name__} should not have more than one required field." ) first_field = getattr(self, class_fields[0].name) other_fields_are_none = all( getattr(self, field.name) is None for field in class_fields[1:]) if other_fields_are_none and not is_tensor(first_field): if isinstance(first_field, dict): iterator = first_field.items() first_field_iterator = True else: try: iterator = iter(first_field) first_field_iterator = True except TypeError: first_field_iterator = False # if we provided an iterator as first field and the iterator is a (key, value) iterator # set the associated fields if first_field_iterator: for element in iterator: if (not isinstance(element, (list, tuple)) or not len(element) == 2 or not isinstance(element[0], str)): break setattr(self, element[0], element[1]) if element[1] is not None: self[element[0]] = element[1] elif first_field is not None: self[class_fields[0].name] = first_field else: for field in class_fields: v = getattr(self, field.name) if v is not None: self[field.name] = v def __delitem__(self, *args, **kwargs): raise Exception( f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." ) def setdefault(self, *args, **kwargs): raise Exception( f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." ) def pop(self, *args, **kwargs): raise Exception( f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") def update(self, *args, **kwargs): raise Exception( f"You cannot use ``update`` on a {self.__class__.__name__} instance." ) def __getitem__(self, k): if isinstance(k, str): inner_dict = {k: v for (k, v) in self.items()} return inner_dict[k] else: return self.to_tuple()[k] def __setattr__(self, name, value): if name in self.keys() and value is not None: # Don't call self.__setitem__ to avoid recursion errors super().__setitem__(name, value) super().__setattr__(name, value) def __setitem__(self, key, value): # Will raise a KeyException if needed super().__setitem__(key, value) # Don't call self.__setattr__ to avoid recursion errors super().__setattr__(key, value) def to_tuple(self) -> Tuple[Any]: """ Convert self to a tuple containing all the attributes/keys that are not `None`. """ return tuple(self[k] for k in self.keys()) @dataclass class ErnieForPreTrainingOutput(ModelOutput): """ Output type of [`ErnieForPreTraining`]. Args: loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss = None prediction_logits = None seq_relationship_logits = None hidden_states = None attentions = None @dataclass class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): """ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). Args: last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. 
""" last_hidden_state: paddle.Tensor = None past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): """ Base class for model's outputs that also contains a pooling of the last hidden states. Args: last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns the classification token after processing through a linear layer and a tanh activation function. The linear layer weights are trained from the next sentence prediction (classification) objective during pretraining. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. """ last_hidden_state: paddle.Tensor = None pooler_output: paddle.Tensor = None past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class SequenceClassifierOutput(ModelOutput): """ Base class for outputs of sentence classification models. 
Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Classification (or regression if config.num_labels==1) loss. logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class TokenClassifierOutput(ModelOutput): """ Base class for outputs of token classification models. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : Classification loss. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class QuestionAnsweringModelOutput(ModelOutput): """ Base class for outputs of question answering models. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): Span-start scores (before SoftMax). end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): Span-end scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None start_logits: paddle.Tensor = None end_logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class MultipleChoiceModelOutput(ModelOutput): """ Base class for outputs of multiple choice models. Args: loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided): Classification loss. logits (`paddle.Tensor` of shape `(batch_size, num_choices)`): *num_choices* is the second dimension of the input tensors. (see *input_ids* above). Classification scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class MaskedLMOutput(ModelOutput): """ Base class for masked language models outputs. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Masked language modeling (MLM) loss. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
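Example (editor's illustrative sketch, not part of the original docstring; the shapes below are toy values):

.. code-block:: python

    import paddle

    # Output objects are dataclasses, so fields are read by name.
    output = MaskedLMOutput(
        loss=paddle.to_tensor([2.31]),
        logits=paddle.rand([2, 8, 50304]))
    print(output.loss.shape)    # [1]
    print(output.logits.shape)  # [2, 8, 50304] -> [batch, seq_len, vocab]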
""" loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class CausalLMOutputWithCrossAttentions(ModelOutput): """ Base class for causal language model (or autoregressive) outputs. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Cross attentions weights after the attention softmax, used to compute the weighted average in the cross-attention heads. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key, value states of the self-attention and the cross-attention layers if model is used in encoder-decoder setting. Only relevant if `config.is_decoder = True`. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None ================================================ FILE: ppfleetx/models/language_model/ernie/layers/transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# TODO: define the classes of Transformer neural network import copy import collections import numpy as np import paddle import paddle.nn.functional as F import paddle.nn as nn from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer import paddle.tensor as tensor from paddle.fluid import layers from paddle import ParamAttr from paddle.fluid.data_feeder import convert_dtype from .model_outputs import BaseModelOutputWithPastAndCrossAttentions from paddle.distributed.fleet.utils import recompute __all__ = [] def _convert_param_attr_to_list(param_attr, n): """ If `param_attr` is a list or tuple, convert every element in it to a ParamAttr instance. Otherwise, repeat `param_attr` `n` times to construct a list, and rename every one by appending a increasing index suffix to avoid having same names when `param_attr` contains a name. Parameters: param_attr (list|tuple|ParamAttr): A list, tuple or something can be converted to a ParamAttr instance by `ParamAttr._to_attr`. n (int): The times to repeat to construct a list when `param_attr` is not a list or tuple. Returns: list: A list composed of each including cell's `param_attr`. """ if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) param_attrs = [] for attr in param_attr: if isinstance(attr, bool): if attr: param_attrs.append(ParamAttr._to_attr(None)) else: param_attrs.append(False) else: param_attrs.append(ParamAttr._to_attr(attr)) # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] elif isinstance(param_attr, bool): param_attrs = [] if param_attr: param_attrs = [ParamAttr._to_attr(None) for i in range(n)] else: param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) for i in range(n): attr_i = copy.deepcopy(attr) if attr.name: attr_i.name = attr_i.name + "_" + str(i) param_attrs.append(attr_i) return param_attrs def _convert_attention_mask(attn_mask, dtype): """ Convert the attention mask to the target dtype we expect. Parameters: attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. Returns: Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. """ if attn_mask is not None and attn_mask.dtype != dtype: attn_mask_dtype = convert_dtype(attn_mask.dtype) if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 else: attn_mask = paddle.cast(attn_mask, dtype) return attn_mask class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. Please refer to `Attention Is All You Need `_ for more details. Parameters: embed_dim (int): The expected feature size in the input and output. 
num_heads (int): The number of heads in multi-head attention. dropout (float, optional): The dropout probability used on attention weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Examples: .. code-block:: python import paddle # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None): super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.q_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. 
cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ and `[batch_size, n_head, sequence_length, d_value]` separately, \ and their data types are same as inputs. """ q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. """ k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. 
If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used for decoder-encoder cross attention. The cache is generated as follows: 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first dimension of `key`. 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def forward(self, query, key=None, value=None, attn_mask=None, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. 
cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ If `cache` is not None, the tuple then includes the new cache \ having the same type as `cache`, and if it is `StaticCache`, it \ is same as the input `cache`, if it is `Cache`, the new cache \ reserves tensors concatanating raw tensors with intermediate \ results of current query. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if cache is None: q, k, v = self._prepare_qkv(query, key, value, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = paddle.matmul( x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if cache is not None: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If `normalize_before` is True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Parameters: d_model (int): The expected feature size in the input and output. nhead (int): The number of heads in multi-head attention(MHA). dim_feedforward (int): The hidden layer size in the feedforward network(FFN). dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward network. Default relu. attn_dropout (float, optional): The dropout probability used in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. 
If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. The `False` value means the corresponding layer would not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False, weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout, mode="upscale_in_train") self.dropout2 = Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, src, src_mask=None, cache=None, output_attentions=False): r""" Applies a Transformer encoder layer on the input. Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. 
src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ self.self_attn.need_weights = output_attentions src_mask = _convert_attention_mask(src_mask, src.dtype) residual = src if self.normalize_before: src = self.norm1(src) attn_outputs = self.self_attn(src, src, src, src_mask, cache) if isinstance(attn_outputs, tuple): src = attn_outputs[0] outputs = attn_outputs[1:] else: src = attn_outputs outputs = None src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src if outputs is None else ( (src, ) + outputs[::-1]) # hidden_states, cache, attentions def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( src, type=self.self_attn.Cache) return incremental_cache class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It would be used as the first layer, and the other layers would be created according to the configurations of it. num_layers (int): The number of encoder layers to be stacked. norm (LayerNorm, optional): the layer normalization component. If provided, apply layer normalization on the output of last encoder layer. Examples: .. 
code-block:: python import paddle from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None, enable_recompute=False): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm self.enable_recompute = enable_recompute def forward(self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): r""" Applies a stack of N Transformer encoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last encoder layer. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `src`, representing the output of Transformer encoder. \ Or a tuple if `cache` is not None, except for encoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ src_mask = _convert_attention_mask(src_mask, src.dtype) output = src # To get cache from None when use_cache is True, which is compatible with HF # while HF requires decoder. The implementation here uses cache update in the # MultiHeadAttention not so efficiently, and maybe optimize it later. if cache is None and getattr(self, "_use_cache", False): cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts # to True when cache is not None. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None all_attentions = [] if output_attentions else None # NOTE: Also includes embeding output which is same as HF. all_hidden_states = [output] if output_hidden_states else None for i, mod in enumerate(self.layers): if self.enable_recompute: # Note: recompute do not support pass as **kwargs yet. 
layer_outputs = recompute( mod, output, src_mask, None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions) else: layer_outputs = mod( output, src_mask=src_mask, cache=None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions=output_attentions) if isinstance(layer_outputs, tuple): output = layer_outputs[0] outputs = layer_outputs[1:] else: output = layer_outputs outputs = None if output_hidden_states: all_hidden_states.append(output) if output_attentions: all_attentions.append(outputs[-1]) if new_caches is not None: new_caches.append(outputs[0] if isinstance(cache[ i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) if self.norm is not None: output = self.norm(output) if output_hidden_states: all_hidden_states[-1] = output if not return_dict: return output return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=output, past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_attentions) def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: list: It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] return cache ================================================ FILE: ppfleetx/models/language_model/ernie/layers/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import functools import inspect import warnings import paddle from paddle.nn import Layer def fn_args_to_dict(func, *args, **kwargs): """ Inspect function `func` and its arguments for running, and extract a dict mapping between argument names and keys. 
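Example (editor's illustrative sketch; `make_linear` is a hypothetical function used only for demonstration):

.. code-block:: python

    def make_linear(in_dim, out_dim, bias=True):
        pass

    # Positional and default arguments are merged into one name -> value dict.
    print(fn_args_to_dict(make_linear, 16, 32))
    # {'in_dim': 16, 'out_dim': 32, 'bias': True}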
""" if hasattr(inspect, 'getfullargspec'): (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(func) else: (spec_args, spec_varargs, spec_varkw, spec_defaults) = inspect.getargspec(func) # add positional argument values init_dict = dict(zip(spec_args, args)) # add default argument values kwargs_dict = dict(zip(spec_args[-len(spec_defaults):], spec_defaults)) if spec_defaults else {} for k in list(kwargs_dict.keys()): if k in init_dict: kwargs_dict.pop(k) kwargs_dict.update(kwargs) init_dict.update(kwargs_dict) return init_dict def adapt_stale_fwd_patch(self, name, value): """ Since there are some monkey patches for forward of PretrainedModel, such as model compression, we make these patches compatible with the latest forward method. """ if name == "forward": # NOTE(guosheng): In dygraph to static, `layer.forward` would be patched # by an instance of `StaticFunction`. And use string compare to avoid to # import fluid. if type(value).__name__.endswith('StaticFunction'): return value if hasattr(inspect, 'getfullargspec'): (patch_spec_args, patch_spec_varargs, patch_spec_varkw, patch_spec_defaults, _, _, _) = inspect.getfullargspec(value) (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(self.forward) else: (patch_spec_args, patch_spec_varargs, patch_spec_varkw, patch_spec_defaults) = inspect.getargspec(value) (spec_args, spec_varargs, spec_varkw, spec_defaults) = inspect.getargspec(self.forward) new_args = [ arg for arg in ('output_hidden_states', 'output_attentions', 'return_dict') if arg not in patch_spec_args and arg in spec_args ] if new_args: if self.__module__.startswith("paddlenlp"): warnings.warn( f"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} is patched and the patch " "might be based on an old oversion which missing some " f"arguments compared with the latest, such as {new_args}. " "We automatically add compatibility on the patch for " "these arguemnts, and maybe the patch should be updated.") else: warnings.warn( f"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} " "is patched and the patch might be conflict with patches made " f"by paddlenlp which seems have more arguments such as {new_args}. " "We automatically add compatibility on the patch for " "these arguemnts, and maybe the patch should be updated.") if isinstance(self, Layer) and inspect.isfunction(value): @functools.wraps(value) def wrap_fwd(*args, **kwargs): for arg in new_args: kwargs.pop(arg, None) return value(self, *args, **kwargs) else: @functools.wraps(value) def wrap_fwd(*args, **kwargs): for arg in new_args: kwargs.pop(arg, None) return value(*args, **kwargs) return wrap_fwd return value class InitTrackerMeta(type(Layer)): """ This metaclass wraps the `__init__` method of a class to add `init_config` attribute for instances of that class, and `init_config` use a dict to track the initial configuration. If the class has `_pre_init` or `_post_init` method, it would be hooked before or after `__init__` and called as `_pre_init(self, init_fn, init_args)` or `_post_init(self, init_fn, init_args)`. Since InitTrackerMeta would be used as metaclass for pretrained model classes, which always are Layer and `type(Layer)` is not `type`, thus use `type(Layer)` rather than `type` as base class for it to avoid inheritance metaclass conflicts. """ def __init__(cls, name, bases, attrs): init_func = cls.__init__ # If attrs has `__init__`, wrap it using accessable `_pre_init, _post_init`. 
# Otherwise, no need to wrap again since the super cls has been wrapped. # TODO: remove reduplicated tracker if using super cls `__init__` pre_init_func = getattr(cls, '_pre_init', None) if '__init__' in attrs else None post_init_func = getattr(cls, '_post_init', None) if '__init__' in attrs else None cls.__init__ = InitTrackerMeta.init_and_track_conf( init_func, pre_init_func, post_init_func) super(InitTrackerMeta, cls).__init__(name, bases, attrs) @staticmethod def init_and_track_conf(init_func, pre_init_func=None, post_init_func=None): """ Wraps `init_func`, which is the `__init__` method of a class, to add an `init_config` attribute for instances of that class. Args: init_func (callable): It should be the `__init__` method of a class. pre_init_func (callable, optional): If provided, it would be hooked before `init_func` and called as `pre_init_func(self, init_func, *init_args, **init_kwargs)`. Default None. post_init_func (callable, optional): If provided, it would be hooked after `init_func` and called as `post_init_func(self, init_func, *init_args, **init_kwargs)`. Default None. Returns: function: the wrapped function """ @functools.wraps(init_func) def __impl__(self, *args, **kwargs): # helper registered by `pre_init_func` if pre_init_func: pre_init_func(self, init_func, *args, **kwargs) # keep full configuration init_func(self, *args, **kwargs) # helper registered by `post_init_func` if post_init_func: post_init_func(self, init_func, *args, **kwargs) self.init_config = kwargs if args: kwargs['init_args'] = args kwargs['init_class'] = self.__class__.__name__ return __impl__ def __setattr__(self, name, value): value = adapt_stale_fwd_patch(self, name, value) return super(InitTrackerMeta, self).__setattr__(name, value) ================================================ FILE: ppfleetx/models/language_model/gpt/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .dygraph.hybrid_model import ( GPTModelHybrid, GPTForPretrainingPipe, GPTPretrainingCriterionHybird, GPTForPretrainingHybrid, GPTForGenerationHybrid) from .auto.auto_model import (GPTModelAuto, GPTForPretrainingAuto, GPTPretrainingCriterionAuto, GPTForGenerationAuto) from .dygraph.single_model import GPTForPretraining, GPTPretrainingCriterion, GPTModel, GPTForGeneration, GPTForSequenceClassification ================================================ FILE: ppfleetx/models/language_model/gpt/auto/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/gpt/auto/auto_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import collections import paddle import paddle.nn as nn import paddle.nn.functional as F import paddle.tensor as tensor import paddle.incubate as incubate import paddle.distributed.auto_parallel as auto from paddle.fluid import layers from paddle.common_ops_import import convert_dtype from paddle.nn.layer.transformer import _convert_param_attr_to_list from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from ..dygraph.processor import ( LogitsProcessorList, MinLengthLogitsProcessor, HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor) class MultiHeadAttention(nn.Layer): """ Attention maps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attend to information from different representation subspaces.
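Example (editor's illustrative sketch of the scaled dot-product step performed by `core_attn`; toy shapes, no mask and no distributed annotations):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    batch, heads, seq_len, head_dim = 2, 4, 5, 8
    q = paddle.rand([batch, heads, seq_len, head_dim])
    k = paddle.rand([batch, heads, seq_len, head_dim])
    v = paddle.rand([batch, heads, seq_len, head_dim])

    product = paddle.matmul(q, k, transpose_y=True) * head_dim ** -0.5
    weights = F.softmax(product)     # attention over the key axis
    out = paddle.matmul(weights, v)
    print(out.shape)  # [2, 4, 5, 8]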
""" Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, fuse_attn_qkv=False, use_recompute=False, recompute_granularity="full", mesh=None, mesh_idx=None): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.fuse_attn_qkv = fuse_attn_qkv self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.mesh = mesh self.mesh_idx = mesh_idx self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if self.fuse_attn_qkv: assert self.kdim == embed_dim assert self.vdim == embed_dim self.qkv_proj = nn.Linear( embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr) else: self.q_proj = nn.Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = nn.Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = nn.Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = nn.Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): auto.shard_tensor(self.qkv_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) mix_layer = self.qkv_proj(query) mix_layer = paddle.reshape_(mix_layer, [0, 0, self.num_heads, 3 * self.head_dim]) mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) assert not isinstance( cache, self.StaticCache ), "cache currently does not support the StaticCache type" if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) if use_cache is True: cache = self.Cache(k, v) return (q, k, v) if use_cache is False else (q, k, v, cache) def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. """ auto.shard_tensor(self.q_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) if use_cache is True: cache = self.Cache(k, v) return (q, k, v) if use_cache is False else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. 
""" auto.shard_tensor(self.k_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.v_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def core_attn(self, q, k, v, attn_mask=None): # scale dot product attention product = paddle.matmul( x=q, y=k, transpose_y=True) * self.head_dim**-0.5 if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) else: weights = incubate.softmax_mask_fuse_upper_triangle(product) if self.dropout: # with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) return out, weights def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if use_cache is False: if self.fuse_attn_qkv: q, k, v = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: if self.fuse_attn_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_recompute and self.recompute_granularity == "core_attn": out, weights = auto.recompute(self.core_attn)(q, k, v, attn_mask=attn_mask) else: out, weights = self.core_attn(q, k, v, attn_mask=attn_mask) auto.shard_tensor(self.out_proj.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerDecoder(nn.Layer): """ TransformerDecoder is a stack of N decoder layers. 
""" def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None, use_recompute=False, recompute_granularity="full"): super(TransformerDecoder, self).__init__() self.num_layers = num_layers self.layers = decoder_layers self.norm = norm self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) elif norm is not None: raise ValueError("Only support LayerNorm") def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False, cache=None): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. """ output = tgt new_caches = [] for i, mod in enumerate(self.layers): auto.shard_tensor( output, mod.mesh[mod.mesh_idx], [mod.mesh.dp] + [None for i in range(len(output.shape) - 1)]) if cache is None: if use_cache: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache) new_caches.append(new_cache) else: if self.use_recompute and self.recompute_granularity == "full": output = auto.recompute(mod)(output, memory, tgt_mask, use_cache, cache) else: output = mod(output, memory, tgt_mask, use_cache, cache) else: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` for more details. If `do_zip` is True, apply `zip` on these tuples to get a list with two elements. """ cache = [layer.gen_cache(memory) for layer in self.layers] if do_zip: cache = list(zip(*cache)) return cache class TransformerDecoderLayer(nn.Layer): """ The transformer decoder layer. It contains multiheadattention and some linear layers. 
""" def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, fuse_attn_qkv=False, use_recompute=False, recompute_granularity="full", mesh=None, mesh_idx=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.mesh = mesh self.mesh_idx = mesh_idx weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], fuse_attn_qkv=fuse_attn_qkv, use_recompute=use_recompute, recompute_granularity=recompute_granularity, mesh=mesh, mesh_idx=mesh_idx) self.linear1 = nn.Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = nn.Linear( dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): auto.shard_tensor(self.linear1.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.linear2.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn": tgt = auto.recompute(self.self_attn)(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) # with get_rng_state_tracker().rng_state('global_seed'): tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) # with get_rng_state_tracker().rng_state('global_seed'): tgt = self.dropout2( self.linear2(F.gelu( self.linear1(tgt), approximate=True))) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): incremental_cache = self.self_attn.gen_cache( memory, type=self.self_attn.Cache) return incremental_cache class GPTEmbeddings(nn.Layer): """ Include embeddings from word and position embeddings. 
""" def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, mesh=None): super(GPTEmbeddings, self).__init__() self.mesh = mesh self.word_embeddings = nn.Embedding( vocab_size, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): if position_ids is None: ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones auto.shard_tensor(self.word_embeddings.weight, self.mesh[0], [self.mesh.mp, None]) input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings embeddings = self.dropout(embeddings) return embeddings class GPTModelAuto(nn.Layer): def __init__(self, vocab_size=51200, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, fuse_attn_qkv=False, use_recompute=False, recompute_granularity="full", mesh=None): super(GPTModelAuto, self).__init__() self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity if not mesh: raise RuntimeError( "AutoPrallel modeling need `mesh` to annotate distributed attribute." 
) self.mesh = mesh self.embeddings = GPTEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, self.initializer_range, self.mesh) stages = self.mesh.stages(num_layers) decoder_layers = nn.LayerList() for i in range(num_layers): decoder_layers.append( TransformerDecoderLayer( d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, dropout=hidden_dropout_prob, activation="gelu", attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), bias_attr=None, fuse_attn_qkv=fuse_attn_qkv, use_recompute=use_recompute, recompute_granularity=recompute_granularity, mesh=self.mesh, mesh_idx=stages[i])) self.decoder = TransformerDecoder( decoder_layers, num_layers, norm="LayerNorm", hidden_size=hidden_size, use_recompute=use_recompute, recompute_granularity=recompute_granularity) def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(attention_mask)[-1] - 1 position_ids = paddle.arange( past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) input_ids.stop_gradient = True position_ids.stop_gradient = True auto.shard_tensor( input_ids, self.mesh[0], [self.mesh.dp] + [None for i in range(len(input_ids.shape) - 1)]) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) if self.training == False: # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: if len(attention_mask.shape) == 2: attention_mask = attention_mask[:, None, None, :] attention_mask = attention_mask + causal_mask else: attention_mask = causal_mask # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True encoder_outputs = self.decoder( embedding_output, memory=None, tgt_mask=None if self.training else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) return encoder_outputs class GPTForPretrainingAuto(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. """ def __init__(self, gpt): super(GPTForPretrainingAuto, self).__init__() self.gpt = gpt def forward(self, input_ids, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None): outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) if use_cache: encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs x_dims_mapping = [self.gpt.mesh.dp] + [ None for i in range(len(encoder_outputs.shape) - 1) ] w_dims_mapping = [self.gpt.mesh.mp, None] matmul = auto.shard_op(paddle.matmul, self.gpt.mesh[-1], [x_dims_mapping, w_dims_mapping, None]) logits = matmul( encoder_outputs, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) if use_cache: return logits, cached_kvs else: return logits class GPTPretrainingCriterionAuto(nn.Layer): """ Criterion for GPT. It calculates the final loss. 
""" def __init__(self, mesh): super(GPTPretrainingCriterionAuto, self).__init__() self.mesh = mesh self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") def forward(self, prediction_scores, masked_lm_labels, loss_mask): """ Args: prediction_scores(Tensor): The logits of masked token prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. masked_lm_labels(Tensor): The labels of the masked language modeling, the dimensionality of `masked_lm_labels` is equal to `prediction_scores`. Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. loss_mask(Tensor): Mask used for calculating the loss of the masked language modeling to avoid calculating some unwanted tokens. Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ masked_lm_labels.stop_gradient = True loss_mask.stop_gradient = True auto.shard_tensor( loss_mask, self.mesh[-1], [self.mesh.dp] + [None for i in range(len(loss_mask.shape) - 1)]) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() return loss class GPTForGenerationAuto(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. """ def __init__(self, gpt, configs): super(GPTForGenerationAuto, self).__init__() self.gpt = gpt self.configs = configs self.max_length = self.configs.get('max_dec_len', 20) self.min_length = self.configs.get('min_dec_len', 0) self.decode_strategy = self.configs.get('decode_strategy', 'sampling') self.early_finish = self.configs.get('early_finish', True) self.temperature = self.configs.get('temperature', 1.0) self.top_k = self.configs.get('top_k', 0) self.top_p = self.configs.get('top_p', 1.0) self.use_topp_sampling = self.configs.get('use_topp_sampling', False) self.inference = self.configs.get('inference', False) self.repetition_penalty = self.configs.get('repetition_penalty', 1.0) self.num_beams = self.configs.get('num_beams', 1) self.num_beam_groups = self.configs.get('num_beam_groups', 1) self.length_penalty = self.configs.get('length_penalty', 0.0) self.early_stopping = self.configs.get('early_stopping', False) self.bos_token_id = self.configs.get('bos_token_id', None) self.eos_token_id = self.configs.get('eos_token_id', None) self.pad_token_id = self.configs.get('pad_token_id', None) self.decoder_start_token_id = self.configs.get( 'decoder_start_token_id', None) self.forced_bos_token_id = self.configs.get('forced_bos_token_id', None) self.forced_eos_token_id = self.configs.get('forced_eos_token_id', None) self.num_return_sequences = self.configs.get('num_return_sequences', 1) self.diversity_rate = self.configs.get('diversity_rate', 0.0) self.use_cache = self.configs.get('use_cache', True) def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): batch_size = 1 if bos_token_id is None: raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") if encoder_output is not None: batch_size = encoder_output.shape[0] return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( input_ids == 
pad_token_id).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( (eos_token_id is not None) and (pad_token_id != eos_token_id)) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids == pad_token_id ).astype(paddle.get_default_dtype()) * -1e9 else: attention_mask = paddle.zeros_like( input_ids, dtype=paddle.get_default_dtype()) return paddle.unsqueeze(attention_mask, axis=[1, 2]) def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): # update scores unfinished_scores = (scores * length + next_scores) / (length + 1) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores def get_logits_processor(self, min_length=None, max_length=None, eos_token_id=None, forced_bos_token_id=None, forced_eos_token_id=None, num_beams=1, num_beam_groups=1, diversity_rate=0.0, repetition_penalty=None): processors = LogitsProcessorList() if min_length is not None and eos_token_id is not None and min_length > -1: processors.append( MinLengthLogitsProcessor(min_length, eos_token_id)) if num_beam_groups > 1 and diversity_rate > 0.0: processors.append( HammingDiversityLogitsProcessor( diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups)) if repetition_penalty is not None and repetition_penalty != 1.0: processors.append( RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) if forced_bos_token_id is not None: processors.append( ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) if forced_eos_token_id is not None: processors.append( ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) # TODO # Add more pre_processing for distribution return processors def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): index = paddle.tile( paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) if attention_mask is not None: model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.gather(position_ids, index) if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: seq_len = model_kwargs["seq_len"] model_kwargs["seq_len"] = paddle.gather(seq_len, index) if "encoder_output" in model_kwargs and model_kwargs[ "encoder_output"] is not None: encoder_output = model_kwargs["encoder_output"] model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.gather(role_ids, index) return input_ids, model_kwargs def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): # only last token for inputs_ids if cache is defined in kwargs position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: if len(attention_mask.shape) == 4: attention_mask = attention_mask[:, -1, -1, :] if "int" in paddle.common_ops_import.convert_dtype( attention_mask.dtype): attention_mask = (1.0 - attention_mask) * -1e4 return { "input_ids": 
input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "cache": cache } def update_model_kwargs_for_generation(self, next_tokens, outputs, model_kwargs, is_encoder_decoder=False): # Update the model inputs during generation. # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` # and they contain pad value, the result vectors updated by this method # may be different from expected. In this case, you need to rewrite the # method. # update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1:]], axis=-1) # update position_ids if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = position_ids[:, -1:] + 1 # update attention_mask if not is_encoder_decoder and "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') if len(attention_mask.shape) == 4: attention_mask = nn.Pad2D( [0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D( [0, 1, 0, 0], value=-1e4)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') else: attention_mask = paddle.concat( [ attention_mask, paddle.ones( [attention_mask.shape[0], 1], dtype="int64") ], axis=-1) model_kwargs["attention_mask"] = attention_mask # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( [role_ids, role_ids[:, -1:]], axis=-1) model_kwargs['res'] = paddle.concat( [model_kwargs['res'], next_tokens], axis=1) return model_kwargs def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast( sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + 
paddle.arange(probs.shape[ 0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = paddle.shape(input_ids) # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype='int64') origin_len = paddle.shape(input_ids)[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype='int64') unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full( [batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) res = paddle.assign(input_ids) model_kwargs['res'] = res # use_cache is immutable, we split it off other mutable kwargs. assert 'use_cache' in model_kwargs immutable = {'use_cache': model_kwargs['use_cache']} del model_kwargs['use_cache'] def _forward_(**args): model_inputs = self.prepare_inputs_for_generation( input_ids, **args, **immutable) return self.gpt(**model_inputs, **immutable) def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): logits = outputs[0] if isinstance(outputs, tuple) else outputs # logits = paddle.matmul( # logits, # self.gpt.embeddings.word_embeddings.weight, # transpose_y=True) x_dims_mapping = [self.gpt.mesh.dp] + [ None for i in range(len(logits.shape) - 1) ] w_dims_mapping = [self.gpt.mesh.mp, None] matmul = auto.shard_op(paddle.matmul, self.gpt.mesh[-1], [x_dims_mapping, w_dims_mapping, None]) with paddle.fluid.name_scope('skip_quant'): logits = matmul( logits, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) if temperature is None or temperature == 1.0: probs = paddle.assign(origin_probs) origin_probs = paddle.log(origin_probs) else: origin_probs = paddle.log(origin_probs) logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: if self.use_topp_sampling: try: from ppfleetx_ops import topp_sampling except ImportError: raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" 
) top_ps_tensor = paddle.full( shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) # TODO fake random seed here # Users should set the random seed dynamically when inference _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) if not self.use_topp_sampling: next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation( scores, next_scores, cur_len - origin_len, unfinished_flag) input_ids = next_tokens if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) model_kwargs = self.update_model_kwargs_for_generation( next_tokens, outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids, scores, unfinished_flag, model_kwargs # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement # the value in model_kwargs should be tensor before while loop outputs = _forward_(**model_kwargs) input_ids, scores, unfinished_flag, model_kwargs = _post_process_( outputs, input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) attn_mask = model_kwargs['attention_mask'] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. model_kwargs['attention_mask'] = paddle.reshape( attn_mask, paddle.shape(attn_mask)) model_kwargs['cache'] = outputs[1] if isinstance(outputs, tuple) else None max_length = paddle.to_tensor(max_length) while cur_len < max_length: # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) # and change it to pass directly to _post_process_ to avoid # closed-loop problem of dynamic-to-static model input_ids, scores, unfinished_flag, model_kwargs = _post_process_( _forward_(**model_kwargs), input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) # early finish should be True in generation scenes, # If users want to test the inference speed, you can just set it False. 
if self.early_finish and not paddle.any(unfinished_flag): break return model_kwargs['res'][:, origin_len:], scores def forward(self, input_ids=None, **model_kwargs): max_length = self.max_length min_length = self.min_length decode_strategy = self.decode_strategy temperature = self.temperature top_k = self.top_k top_p = self.top_p repetition_penalty = self.repetition_penalty num_beams = self.num_beams num_beam_groups = self.num_beam_groups length_penalty = self.length_penalty early_stopping = self.early_stopping bos_token_id = self.bos_token_id eos_token_id = self.eos_token_id pad_token_id = self.pad_token_id decoder_start_token_id = self.decoder_start_token_id forced_bos_token_id = self.forced_bos_token_id forced_eos_token_id = self.forced_eos_token_id num_return_sequences = self.num_return_sequences diversity_rate = self.diversity_rate use_cache = self.use_cache assert ( decode_strategy in ["greedy_search", "sampling", "beam_search"] ), "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( decode_strategy) bos_token_id = bos_token_id if bos_token_id is not None else getattr( self.gpt, 'bos_token_id', None) eos_token_id = eos_token_id if eos_token_id is not None else getattr( self.gpt, 'eos_token_id', None) pad_token_id = pad_token_id if pad_token_id is not None else getattr( self.gpt, 'pad_token_id', None) forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr( self.gpt, 'forced_bos_token_id', None) forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr( self.gpt, 'forced_eos_token_id', None) decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr( self.gpt, 'decoder_start_token_id', None) # params check if input_ids is None: # Init `input_ids` with bos_token_id input_ids = self.prepare_input_ids_for_generation(bos_token_id) if model_kwargs.get("attention_mask", None) is None: # TODO # Init `attention_mask` depending on `pad_token_id` model_kwargs[ "attention_mask"] = self.prepare_attention_mask_for_generation( input_ids, pad_token_id, eos_token_id) if model_kwargs.get("position_ids", None) is None: model_kwargs['position_ids'] = paddle.arange( 0, paddle.shape(model_kwargs['attention_mask'])[-1], dtype=input_ids.dtype).unsqueeze(0) self.is_encoder_decoder = False model_kwargs["use_cache"] = use_cache if self.inference: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static min_len = input_ids.shape[-1] max_len = input_ids.shape[-1] paddle.increment(min_len, min_length) paddle.increment(max_len, max_length) else: input_len = input_ids.shape[-1] max_len = max_length + input_len min_len = min_length + input_len logits_processors = self.get_logits_processor( min_length=min_len, max_length=max_len, eos_token_id=eos_token_id, forced_bos_token_id=forced_bos_token_id, forced_eos_token_id=forced_eos_token_id, num_beams=num_beams, num_beam_groups=num_beam_groups, diversity_rate=diversity_rate, repetition_penalty=repetition_penalty) if decode_strategy == 'sampling': if num_return_sequences > 1: input_ids, model_kwargs = self.expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, **model_kwargs) ret = self.sample(input_ids, logits_processors, max_len, pad_token_id, eos_token_id, top_k, top_p, temperature, **model_kwargs) else: raise ValueError(f'Not support {decode_strategy} strategy yet!') return ret ================================================ FILE: 
ppfleetx/models/language_model/gpt/auto/auto_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import argparse import numpy as np import paddle import paddle.distributed as dist from paddle import LazyGuard from paddle.static import InputSpec from paddle.distributed.fleet import auto from ...auto_utils import process_configs import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.log import logger from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.core.module.basic_module import BasicModule class LanguageModuleAuto(BasicModule): def __init__(self, configs): self.nranks = dist.get_world_size() super(LanguageModuleAuto, self).__init__(configs) self.loss_fn = self.get_loss_fn() def process_configs(self, configs): configs = process_configs(configs) return configs def get_model_size(self, l, h, v, s): P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h)) logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) class GPTModuleAuto(LanguageModuleAuto): def __init__(self, configs): super(GPTModuleAuto, self).__init__(configs) def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len self.get_model_size(l, h, v, s) self.tokenizer = GPTTokenizer.from_pretrained("gpt2") with LazyGuard(): model = gpt.GPTForPretrainingAuto( gpt.GPTModelAuto(**model_setting)) return model def get_loss_fn(self): model_setting = copy.deepcopy(self.configs.Model) return gpt.GPTPretrainingCriterionAuto(model_setting['mesh']) class GPTGenerationModuleAuto(BasicModule): def __init__(self, configs): self.configs = configs self.generation_cfgs = configs.Generation self.nranks = paddle.distributed.get_world_size() super().__init__(configs) def process_configs(self, configs): configs = process_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") with LazyGuard(): model = gpt.GPTForGenerationAuto( gpt.GPTModelAuto(**model_setting), self.generation_cfgs) self.tokenizer = GPTTokenizer.from_pretrained("gpt2") self.generation_cfgs['max_dec_len'] = self.adjust_length_to_model( self.generation_cfgs['max_dec_len'], 512) self.generation_cfgs['bos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['eos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['pad_token_id'] = self.tokenizer.eos_token_id return model def adjust_length_to_model(self, length, max_sequence_length): if length < 0 or length > max_sequence_length: length = max_sequence_length return length def left_padding(self, inputs, pad_id, 
padding="longest"): assert "input_ids" in inputs, "input_ids should be in inputs!" max_length = 0 for ids in inputs["input_ids"]: max_length = max(max_length, len(ids)) def extend_max_lenth(value, max_length, to_pad_id): return [to_pad_id] * (max_length - len(value)) + value def extend_filed(name, max_length, to_pad_id): values = inputs[name] res = [] for index, value in enumerate(values): res.append(extend_max_lenth(value, max_length, to_pad_id)) inputs[name] = res extend_filed("input_ids", max_length, pad_id) if "attention_mask" in inputs: extend_filed("attention_mask", max_length, 0) if "position_ids" in inputs: extend_filed("position_ids", max_length, 0) return inputs def input_spec(self): return [InputSpec(shape=[None, None], name="input_ids", dtype='int64')] ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import collections
import logging
from distutils.util import strtobool
import os
import math

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.tensor as tensor
from paddle.fluid import layers
from paddle.nn.layer.transformer import _convert_param_attr_to_list
import paddle.incubate as incubate
from paddle.common_ops_import import convert_dtype

import paddle.distributed.fleet as fleet
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc
from paddle.distributed.fleet.utils import recompute
from paddle.autograd import PyLayer

import sys

from .single_model import ExpertLayer
from .sequence_parallel_utils import ScatterOp, GatherOp, \
    mark_as_sequence_parallel_parameter, ColumnSequenceParallelLinear, RowSequenceParallelLinear
from .processor import (
    LogitsProcessorList, MinLengthLogitsProcessor,
    HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor,
    ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor)

from ppfleetx.models.language_model.moe import MoELayer
from ppfleetx.distributed.apis import env
from ppfleetx.utils.log import logger

import numpy as np

try:
    from paddle.nn.functional.flash_attention import flash_attention
except:
    flash_attention = None


def get_attr(layer, name):
    if getattr(layer, name, None) is not None:
        return getattr(layer, name, None)
    else:
        return get_attr(layer._layer, name)


def parallel_matmul(lm_output, logit_weights, parallel_output):
    """
    Matmul of `lm_output` and `logit_weights` under model parallelism.
    If the model-parallel world size is larger than 1, each rank computes
    its own slice of the logits; the slices are concatenated across the
    model-parallel group unless `parallel_output` is True.
    """
    hcg = env.get_hcg()
    model_parallel_group = hcg.get_model_parallel_group()
    world_size = hcg.get_model_parallel_world_size()
    rank = hcg.get_model_parallel_rank()

    if world_size > 1:
        input_parallel = paddle.distributed.collective._c_identity(
            lm_output, group=model_parallel_group)

        logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)

        if parallel_output:
            return logits

        return paddle.distributed.collective._c_concat(
            logits, group=model_parallel_group)
    else:
        logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)
        return logits


class MultiHeadAttention(nn.Layer):
    """
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple parallel attention to jointly
    attend to information from different representation subspaces.
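
    Here the q/k/v and output projections are split across the model-parallel
    group, so each rank only computes `num_heads / num_partitions` heads.
    Inputs are expected as `[batch, seq, hidden]`, or `[seq, batch, hidden]`
    when `sequence_parallel` is enabled.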
""" Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, output_layer_weight_attr=None, bias_attr=None, fuse_attn_qkv=False, scale_qk_coeff=1.0, num_partitions=1, fused_linear=False, use_recompute=False, recompute_granularity="full", sequence_parallel=False, do_recompute=True, use_flash_attn=False): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.fuse_attn_qkv = fuse_attn_qkv self.scale_qk_coeff = scale_qk_coeff self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.do_recompute = do_recompute self.sequence_parallel = sequence_parallel self.use_flash_attn = use_flash_attn if flash_attention else None if sequence_parallel: ColumnParallelLinear = ColumnSequenceParallelLinear RowParallelLinear = RowSequenceParallelLinear else: ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear RowParallelLinear = fleet.meta_parallel.RowParallelLinear self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.num_heads % num_partitions == 0, "num_heads {} must be divisible by num_partitions {}".format( self.num_heads, num_partitions) self.num_heads = self.num_heads // num_partitions if self.fuse_attn_qkv: assert self.kdim == embed_dim assert self.vdim == embed_dim self.qkv_proj = ColumnParallelLinear( embed_dim, 3 * embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) else: self.q_proj = ColumnParallelLinear( embed_dim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) self.k_proj = ColumnParallelLinear( self.kdim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) self.v_proj = ColumnParallelLinear( self.vdim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) self.out_proj = RowParallelLinear( embed_dim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=output_layer_weight_attr, has_bias=True, input_is_parallel=True, fuse_matmul_bias=fused_linear) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) assert not isinstance( cache, self.StaticCache ), "cache currently does not support the StaticCache type" if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. 
If `cache` is not None, using cached results to reduce redundant calculations. """ q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. """ k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def _flash_attention(self, q, k, v, attn_mask=None): if self.sequence_parallel: perm = [1, 0, 2, 3] q = tensor.transpose(x=q, perm=perm) k = tensor.transpose(x=k, perm=perm) v = tensor.transpose(x=v, perm=perm) out, weights = flash_attention( q, k, v, self.dropout, causal=True, return_softmax=self.need_weights) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) if self.sequence_parallel: perm = [1, 0, 2] out = tensor.transpose(x=out, perm=perm) return out, weights def core_attn(self, q, k, v, attn_mask=None): perm = [1, 2, 0, 3] if self.sequence_parallel else [0, 2, 1, 3] q = tensor.transpose(x=q, perm=perm) k = tensor.transpose(x=k, perm=perm) v = tensor.transpose(x=v, perm=perm) # scale dot product attention scale_qk_coeff = self.scale_qk_coeff * self.head_dim**0.5 product = paddle.matmul( x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) if self.scale_qk_coeff != 1.0: product = product.scale(self.scale_qk_coeff) # softmax_mask_fuse_upper_triangle is not supported sif paddle is not compiled with cuda/rocm if not paddle.is_compiled_with_cuda(): attn_mask = get_triangle_upper_mask(product, attn_mask) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) else: weights = incubate.softmax_mask_fuse_upper_triangle(product) if self.dropout: with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads if self.sequence_parallel: out = 
tensor.transpose(out, perm=[2, 0, 1, 3]) else: out = tensor.transpose(out, perm=[0, 2, 1, 3]) # If sequence_parallel is true, out shape is [s, b, h] after reshape # else out shape is [b, s, h] out = tensor.reshape(x=out, shape=[0, 0, -1]) return out, weights def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # if sequence_parallel is true, query, key, value shape are [s, b, h], # else their shape are [b, s, h], n is mp parallelism. # no matter sequence_parallel is true or false, # after reshape, q, k, v shape should be [b, num_heads/n, s, head_dim] # compute q ,k ,v if self.fuse_attn_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_flash_attn and attn_mask is None: attn_func = self._flash_attention else: attn_func = self.core_attn if self.use_recompute and self.recompute_granularity == "core_attn" and self.do_recompute: out, weights = recompute(attn_func, q, k, v, attn_mask) else: out, weights = attn_func(q, k, v, attn_mask=attn_mask) # project to output # if sequence_parallel is true, out shape are [s/n, b, h], # else their shape are [b, s, h], n is mp parallelism. out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerDecoder(nn.Layer): """ TransformerDecoder is a stack of N decoder layers. """ def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None, use_recompute=False, recompute_granularity="full", sequence_parallel=False, no_recompute_layers=None): super(TransformerDecoder, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.no_recompute_layers = no_recompute_layers self.num_layers = num_layers self.layers = decoder_layers self.norm = norm self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.sequence_parallel = sequence_parallel if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) # if sequence parallel is true, # register hook to all_reduce gradient of weight, bias if self.sequence_parallel: mark_as_sequence_parallel_parameter(self.norm.weight) mark_as_sequence_parallel_parameter(self.norm.bias) elif norm is not None: raise ValueError("Only support LayerNorm") def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False, cache=None): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. 
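
        When `use_cache` is True the method also returns the per-layer caches,
        i.e. `(output, new_caches)`; otherwise only `output` is returned.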
""" output = tgt new_caches = [] for i, mod in enumerate(self.layers): if cache is None: if use_cache: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache) new_caches.append(new_cache) else: if self.use_recompute and self.recompute_granularity == "full" and i not in self.no_recompute_layers: output = recompute(mod, output, memory, tgt_mask, use_cache, cache) else: output = mod(output, memory, tgt_mask, use_cache, cache) else: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` for more details. If `do_zip` is True, apply `zip` on these tuples to get a list with two elements. """ cache = [layer.gen_cache(memory) for layer in self.layers] if do_zip: cache = list(zip(*cache)) return cache class TransformerDecoderLayer(nn.Layer): """ The transformer decoder layer. It contains multiheadattention and some linear layers. """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, output_layer_weight_attr=None, bias_attr=None, num_partitions=1, fused_linear=False, fuse_attn_qkv=False, scale_qk_coeff=1.0, moe_configs=None, recompute_attn=False, use_recompute=False, recompute_granularity="full", sequence_parallel=False, do_recompute=True, skip_quant_tensors=[], use_flash_attn=False): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.sequence_parallel = sequence_parallel self.do_recompute = do_recompute self.expert_mode = False # moe config if moe_configs is not None: self.gate = moe_configs.get('gate', 'gshard') self.top_k = moe_configs.get('top_k', 2) self.num_experts = moe_configs.get('num_experts', 1) self.expert_mode = moe_configs.get('expert_mode', False) if sequence_parallel: ColumnParallelLinear = ColumnSequenceParallelLinear RowParallelLinear = RowSequenceParallelLinear else: ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear RowParallelLinear = fleet.meta_parallel.RowParallelLinear weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) output_layer_weight_attrs = _convert_param_attr_to_list( output_layer_weight_attr, 3) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], output_layer_weight_attr=output_layer_weight_attrs[0], num_partitions=num_partitions, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=scale_qk_coeff, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, do_recompute=do_recompute, use_flash_attn=use_flash_attn) if self.expert_mode: experts_list = nn.LayerList([ ExpertLayer(d_model, dim_feedforward) for 
e in range(self.num_experts) ]) hcg = env.get_hcg() moe_group = hcg.get_expert_parallel_group() mp_group = hcg.get_model_parallel_group() self.moe_mlp = MoELayer( d_model=d_model, experts=experts_list, gate=self.gate, top_k=self.top_k, moe_group=moe_group, mp_group=mp_group, recompute_interval=int(self.use_recompute)) else: self.linear1 = ColumnParallelLinear( d_model, dim_feedforward, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attrs[2], gather_output=False, has_bias=True, fuse_matmul_bias=fused_linear) self.linear2 = RowParallelLinear( dim_feedforward, d_model, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=output_layer_weight_attrs[2], input_is_parallel=True, has_bias=True, fuse_matmul_bias=fused_linear) if 'linear1' in skip_quant_tensors: self.linear1.skip_quant = True if 'linear2' in skip_quant_tensors: self.linear2.skip_quant = True self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) if self.sequence_parallel: # if sequence parallel is true, register hook to all_reduce gradient of bias mark_as_sequence_parallel_parameter(self.norm1.weight) mark_as_sequence_parallel_parameter(self.norm1.bias) mark_as_sequence_parallel_parameter(self.norm2.weight) mark_as_sequence_parallel_parameter(self.norm2.bias) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, tgt, memory=None, tgt_mask=None, use_cache=False, cache=None): residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn" and self.do_recompute: tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache) else: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) # If use sequence_parallel, different input partition in dropout # should use different seed. if self.sequence_parallel: current_seed = 'local_seed' else: current_seed = 'global_seed' with get_rng_state_tracker().rng_state(current_seed): tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) if self.expert_mode: tgt = self.moe_mlp(tgt) else: with get_rng_state_tracker().rng_state(current_seed): tgt = self.dropout2( self.linear2(F.gelu( self.linear1(tgt), approximate=True))) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): incremental_cache = self.self_attn.gen_cache( memory, type=self.self_attn.Cache) return incremental_cache class GPTEmbeddings(nn.Layer): """ Include embeddings from word and position embeddings. 
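
    The word embedding table is a `VocabParallelEmbedding` sharded over the
    model-parallel group; when `sequence_parallel` is enabled the returned
    embeddings are scattered along the sequence dimension as `[s/n, b, h]`.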
""" def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, sequence_parallel=False, freeze_embedding=False): super(GPTEmbeddings, self).__init__() self.sequence_parallel = sequence_parallel self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( vocab_size, hidden_size, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) if freeze_embedding: self.word_embeddings.weight.learning_rate = 0.0 self.position_embeddings.weight.learning_rate = 0.0 self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): if position_ids is None: ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings # if sequence parallel is true, change embedding shape [b, s, h] to [s, b, h] # set the sequence dim as first, so the split in sequence dim is data-continuous if self.sequence_parallel: embeddings = paddle.transpose(embeddings, perm=[1, 0, 2]) embeddings = ScatterOp.apply(embeddings) with get_rng_state_tracker().rng_state('local_seed'): embeddings = self.dropout(embeddings) else: embeddings = self.dropout(embeddings) return embeddings class GPTModelHybrid(nn.Layer): def __init__(self, vocab_size=51200, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, num_partitions=1, moe_configs=None, use_recompute=False, fused_linear=False, fuse_attn_qkv=False, scale_qk_by_layer_num=True, recompute_granularity="full", sequence_parallel=False, no_recompute_layers=None, skip_tensor_map={}, freeze_embedding=False, use_flash_attn=False, fused_softmax_with_triangular=False): super(GPTModelHybrid, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.fused_softmax_with_triangular = fused_softmax_with_triangular if use_flash_attn: if flash_attention: logger.info("Flash-attention enabled.") else: use_flash_attn = False logger.warning( "Flash-attention is not support in this Paddle version.") hcg = env.get_hcg() mp_size = hcg.get_model_parallel_world_size() if mp_size <= 1: sequence_parallel = False logging.warning( "If mp_size <= 1, sequence_parallel strategy will be turned off in GPTModelHybrid model." 
) self.embeddings = GPTEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, self.initializer_range, sequence_parallel, freeze_embedding) self.sequence_parallel = sequence_parallel decoder_layers = nn.LayerList() for i in range(num_layers): decoder_layers.append( TransformerDecoderLayer( d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, dropout=hidden_dropout_prob, activation="gelu", attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), output_layer_weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range / math.sqrt( 2.0 * num_layers))), bias_attr=None, num_partitions=num_partitions, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=num_layers if scale_qk_by_layer_num else 1.0, moe_configs=moe_configs, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, do_recompute=i not in no_recompute_layers, skip_quant_tensors=skip_tensor_map.get('block_{}'.format( i), []), use_flash_attn=use_flash_attn)) self.decoder = TransformerDecoder( decoder_layers, num_layers, norm="LayerNorm", hidden_size=hidden_size, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, no_recompute_layers=no_recompute_layers) def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(attention_mask)[-1] - 1 position_ids = paddle.arange( past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) # if sequence_parallel is true, embedding_output shape is [s/n, b, h] # else its shape is [b, s, h], n is mp parallelism embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: if len(attention_mask.shape) == 2: attention_mask = attention_mask[:, None, None, :] attention_mask = attention_mask + causal_mask else: attention_mask = causal_mask # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True encoder_outputs = self.decoder( embedding_output, memory=None, tgt_mask=None if (self.fused_softmax_with_triangular and self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) if self.sequence_parallel: encoder_outputs = GatherOp.apply(encoder_outputs) return encoder_outputs class GPTForPretrainingHybrid(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
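
    A minimal usage sketch (illustrative values only, assuming the fleet
    hybrid-parallel environment has already been initialized):

        model = GPTForPretrainingHybrid(GPTModelHybrid(vocab_size=51200))
        logits = model(input_ids)  # token logits tied to the word-embedding weight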
""" def __init__(self, gpt): super(GPTForPretrainingHybrid, self).__init__() self.gpt = gpt # extra_parameters using for sharding stage3 to register extra_parameters self.extra_parameters = [ get_attr(self.gpt.embeddings.word_embeddings, "weight") ] def forward(self, input_ids, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None): outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) if use_cache: encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs logits = parallel_matmul( encoder_outputs, get_attr(self.gpt.embeddings.word_embeddings, "weight"), True) if use_cache: return logits, cached_kvs else: return logits class GPTPretrainingCriterionHybird(nn.Layer): """ Criterion for GPT. It calculates the final loss. """ def __init__(self, topo=None, sequence_parallel=False): super(GPTPretrainingCriterionHybird, self).__init__() self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") self.parallel_loss_func = \ fleet.meta_parallel.ParallelCrossEntropy(mp_group=env.get_hcg().get_model_parallel_group()) self.sequence_parallel = sequence_parallel def forward(self, prediction_scores, masked_lm_labels, loss_mask): """ Args: prediction_scores(Tensor): The logits of masked token prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. masked_lm_labels(Tensor): The labels of the masked language modeling, the dimensionality of `masked_lm_labels` is equal to `prediction_scores`. Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. loss_mask(Tensor): Mask used for calculating the loss of the masked language modeling to avoid calculating some unwanted tokens. Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. 
""" hcg = env.get_hcg() mp_size = hcg.get_model_parallel_world_size() if self.sequence_parallel: masked_lm_labels = masked_lm_labels.transpose([1, 0]) loss_mask = loss_mask.transpose([1, 0]) if mp_size > 1: if paddle.is_compiled_with_cuda() and True: masked_lm_loss = self.parallel_loss_func( prediction_scores, masked_lm_labels.unsqueeze(2)) else: prediction_scores = ConcatSoftmaxInput.apply( prediction_scores, group=env.get_hcg().get_model_parallel_group()) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) else: masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() return loss # these Layers is just for PipelineParallel class GPTPretrainingCriterionPipe(GPTPretrainingCriterionHybird): """Extends GPTPretrainingCriterion to meet the input standard.""" def forward(self, prediction_scores, args): masked_lm_labels = args[0] loss_mask = args[1] loss = super().forward(prediction_scores, masked_lm_labels, loss_mask) return loss class EmbeddingPipe(GPTEmbeddings): """Extends GPTEmbeddings to forward attention_mask through the pipeline.""" @property def embedding_weight(self): return get_attr(self.word_embeddings, "weight") def forward(self, tensors): input_ids, position_ids = tensors embeddings = super().forward( input_ids=input_ids, position_ids=position_ids) return embeddings class LayerNormPipe(nn.Layer): def __init__(self, normalized_shape, epsilon=1e-05, weight_attr=None, bias_attr=None, name=None, sequence_parallel=False, is_last=False): super(LayerNormPipe, self).__init__() self.sequence_parallel = sequence_parallel self.is_last = is_last self.norm = nn.LayerNorm( normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, bias_attr=bias_attr, name=name) if self.sequence_parallel: mark_as_sequence_parallel_parameter(self.norm.weight) mark_as_sequence_parallel_parameter(self.norm.bias) def forward(self, input): output = self.norm(input) if self.sequence_parallel and self.is_last: output = GatherOp.apply(output) return output class GPTForPretrainingPipe(PipelineLayer): """GPTForPretraining adapted for pipeline parallelism. The largest change is flattening the GPTModel class so we can express it as a sequence of layers including embedding, transformer layers, and output. 
""" def __init__(self, vocab_size, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, num_partitions=1, topology=None, use_recompute=False, fused_linear=False, fuse_attn_qkv=False, scale_qk_by_layer_num=True, moe_configs=None, recompute_granularity="full", virtual_pp_degree=1, sequence_parallel=False, no_recompute_layers=None, pp_recompute_interval=1, use_flash_attn=False, fused_softmax_with_triangular=False): # forward desc self.descs = [] if no_recompute_layers is None: no_recompute_layers = [] else: if recompute_granularity == 'full': assert len(no_recompute_layers) == 0, \ "for pp with full recompute, no_recompute_layers is not support" if use_flash_attn: if flash_attention: logger.info("Flash-attention enabled.") else: use_flash_attn = False logger.warning( "Flash-attention is not support in this Paddle version.") hcg = env.get_hcg() mp_size = hcg.get_model_parallel_world_size() if mp_size <= 1: sequence_parallel = False logging.warning( "If mp_size <= 1, sequence_parallel strategy will be turned off in GPTForPretrainingPipe model." ) self.descs.append( SharedLayerDesc( 'embed', EmbeddingPipe, shared_weight_attr='embedding_weight', vocab_size=vocab_size, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, max_position_embeddings=max_position_embeddings, type_vocab_size=type_vocab_size, initializer_range=0.02, sequence_parallel=sequence_parallel)) for i in range(num_layers): self.descs.append( LayerDesc( TransformerDecoderLayer, d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=initializer_range)), output_layer_weight_attr=paddle. 
ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range / math.sqrt(2.0 * num_layers))), bias_attr=None, num_partitions=num_partitions, moe_configs=moe_configs, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=num_layers if scale_qk_by_layer_num else 1.0, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, do_recompute=i not in no_recompute_layers, use_flash_attn=use_flash_attn)) self.descs.append( LayerDesc( LayerNormPipe, normalized_shape=hidden_size, sequence_parallel=sequence_parallel, is_last=True)) def _logits_helper(embedding, output): return parallel_matmul(output, embedding.embedding_weight, True) self.descs.append( SharedLayerDesc( 'embed', EmbeddingPipe, forward_func=_logits_helper, shared_weight_attr='embedding_weight', vocab_size=vocab_size, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, max_position_embeddings=max_position_embeddings, type_vocab_size=type_vocab_size, initializer_range=0.02)) recompute_interval = 0 if recompute and recompute_granularity == "full": assert pp_recompute_interval <= \ num_layers // (virtual_pp_degree * env.get_hcg().topology().get_dim_size("pipe")), \ "pp recompute interval should smaller than num layers of each pp chunk" recompute_interval = pp_recompute_interval seg_method = "layer:TransformerDecoderLayer" if num_layers % env.get_hcg().topology().get_dim_size("pipe") != 0: seg_method = "uniform" super().__init__( layers=self.descs, loss_fn=GPTPretrainingCriterionPipe( sequence_parallel=sequence_parallel), topology=env.get_hcg().topology(), seg_method=seg_method, recompute_interval=recompute_interval, recompute_ctx={ "mp_group": env.get_hcg().get_model_parallel_group(), "offload": False, "partition": False, }, num_virtual_pipeline_stages=virtual_pp_degree) class GPTForGenerationHybrid(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
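        configs (dict): Decoding options read in ``__init__``.

        Example (illustrative; ``gpt_model`` and ``input_ids`` are assumed to be
        a constructed hybrid GPT model and a prepared int64 tensor, and the
        values below are placeholders)::

            configs = {
                'max_dec_len': 20,
                'decode_strategy': 'sampling',
                'temperature': 1.0,
                'top_k': 5,
                'top_p': 0.9,
                'use_cache': True,
            }
            generator = GPTForGenerationHybrid(gpt_model, configs)
            output_ids, scores = generator(input_ids)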
""" def __init__(self, gpt, configs): super(GPTForGenerationHybrid, self).__init__() self.gpt = gpt # extra_parameters using for sharding stage3 to register extra_parameters self.extra_parameters = [ get_attr(self.gpt.embeddings.word_embeddings, "weight") ] self.configs = configs self.max_length = self.configs.get('max_dec_len', 20) self.min_length = self.configs.get('min_dec_len', 0) self.decode_strategy = self.configs.get('decode_strategy', 'sampling') self.temperature = self.configs.get('temperature', 1.0) self.top_k = self.configs.get('top_k', 0) self.top_p = self.configs.get('top_p', 1.0) self.repetition_penalty = self.configs.get('repetition_penalty', 1.0) self.num_beams = self.configs.get('num_beams', 1) self.num_beam_groups = self.configs.get('num_beam_groups', 1) self.length_penalty = self.configs.get('length_penalty', 0.0) self.early_stopping = self.configs.get('early_stopping', False) self.bos_token_id = self.configs.get('bos_token_id', None) self.eos_token_id = self.configs.get('eos_token_id', None) self.pad_token_id = self.configs.get('pad_token_id', None) self.decoder_start_token_id = self.configs.get( 'decoder_start_token_id', None) self.forced_bos_token_id = self.configs.get('forced_bos_token_id', None) self.forced_eos_token_id = self.configs.get('forced_eos_token_id', None) self.num_return_sequences = self.configs.get('num_return_sequences', 1) self.diversity_rate = self.configs.get('diversity_rate', 0.0) self.use_cache = self.configs.get('use_cache', True) def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): batch_size = 1 if bos_token_id is None: raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") if encoder_output is not None: batch_size = encoder_output.shape[0] return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( input_ids == pad_token_id).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( (eos_token_id is not None) and (pad_token_id != eos_token_id)) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids == pad_token_id ).astype(paddle.get_default_dtype()) * -1e9 else: attention_mask = paddle.zeros_like( input_ids, dtype=paddle.get_default_dtype()) return paddle.unsqueeze(attention_mask, axis=[1, 2]) def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): # update scores unfinished_scores = (scores * length + next_scores) / (length + 1) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores def get_logits_processor(self, min_length=None, max_length=None, eos_token_id=None, forced_bos_token_id=None, forced_eos_token_id=None, num_beams=1, num_beam_groups=1, diversity_rate=0.0, repetition_penalty=None): processors = LogitsProcessorList() if min_length is not None and eos_token_id is not None and min_length > -1: processors.append( MinLengthLogitsProcessor(min_length, eos_token_id)) if num_beam_groups > 1 and diversity_rate > 0.0: processors.append( HammingDiversityLogitsProcessor( diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups)) if repetition_penalty is not None and repetition_penalty != 1.0: processors.append( RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) if forced_bos_token_id is not None: processors.append( 
ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) if forced_eos_token_id is not None: processors.append( ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) # TODO # Add more pre_processing for distribution return processors def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): index = paddle.tile( paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) if attention_mask is not None: model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.gather(position_ids, index) if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: seq_len = model_kwargs["seq_len"] model_kwargs["seq_len"] = paddle.gather(seq_len, index) if "encoder_output" in model_kwargs and model_kwargs[ "encoder_output"] is not None: encoder_output = model_kwargs["encoder_output"] model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.gather(role_ids, index) return input_ids, model_kwargs def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): # only last token for inputs_ids if cache is defined in kwargs position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: if len(attention_mask.shape) == 4: attention_mask = attention_mask[:, -1, -1, :] if "int" in paddle.common_ops_import.convert_dtype( attention_mask.dtype): attention_mask = (1.0 - attention_mask) * -1e4 if cache is not None: input_ids = input_ids[:, -1].unsqueeze(-1) if position_ids is not None: position_ids = position_ids[:, -1].unsqueeze(-1) return { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "cache": cache } def update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False): # Update the model inputs during generation. # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` # and they contain pad value, the result vectors updated by this method # may be different from expected. In this case, you need to rewrite the # method. 
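        # In outline (added description, behavior unchanged): the default update
        # below takes the new `cache` from `outputs[1]` when the model returned a
        # tuple, repeats the last `token_type_ids` and `role_ids` columns, appends
        # `position_ids[:, -1:] + 1`, and grows `attention_mask` by one step
        # (replicate-pad a row, pad a masked column, then unmask the new position).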
# update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1:]], axis=-1) # update position_ids if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.concat( [position_ids, position_ids[:, -1:] + 1], axis=-1) # update attention_mask if not is_encoder_decoder and "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') if len(attention_mask.shape) == 4: attention_mask = nn.Pad2D( [0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D( [0, 1, 0, 0], value=-1e4)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') else: attention_mask = paddle.concat( [ attention_mask, paddle.ones( [attention_mask.shape[0], 1], dtype="int64") ], axis=-1) model_kwargs["attention_mask"] = attention_mask # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( [role_ids, role_ids[:, -1:]], axis=-1) return model_kwargs def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast( sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + paddle.arange(probs.shape[ 0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = input_ids.shape origin_len = input_ids.shape[1] unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full( [batch_size, 1], 0.0, 
dtype=paddle.get_default_dtype()) # use_cache is immutable, we split it off other mutable kwargs. assert 'use_cache' in model_kwargs immutable = {'use_cache': model_kwargs['use_cache']} del model_kwargs['use_cache'] def _forward_(**args): model_inputs = self.prepare_inputs_for_generation( input_ids, **args, **immutable) return self.gpt(**model_inputs, **immutable) def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): logits = outputs[0] if isinstance(outputs, tuple) else outputs logits = parallel_matmul( logits, get_attr(self.gpt.embeddings.word_embeddings, "weight"), False) # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) origin_probs = paddle.log(origin_probs) if temperature is not None and temperature != 1.0: logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: probs = TopPProcess(probs, top_p, min_tokens_to_keep) next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation( scores, next_scores, cur_len - origin_len, unfinished_flag) input_ids = paddle.concat([input_ids, next_tokens], axis=1) if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) model_kwargs = self.update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids, scores, unfinished_flag, model_kwargs # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement # the value in model_kwargs should be tensor before while loop outputs = _forward_(**model_kwargs) input_ids, scores, unfinished_flag, model_kwargs = _post_process_( outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs) cur_len += 1 attn_mask = model_kwargs['attention_mask'] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
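        # (added note) reshaping by the runtime `paddle.shape(attn_mask)` keeps all
        # four dimensions dynamic after dynamic-to-static conversion, so the
        # per-step mask updates inside the decoding loop do not bake a fixed
        # sequence length into the exported graph.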
model_kwargs['attention_mask'] = paddle.reshape( attn_mask, paddle.shape(attn_mask)) model_kwargs['cache'] = outputs[1] if isinstance(outputs, tuple) else None while cur_len < max_length: # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) # and change it to pass directly to _post_process_ to avoid # closed-loop problem of dynamic-to-static model input_ids, scores, unfinished_flag, model_kwargs = _post_process_( _forward_(**model_kwargs), input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs) cur_len += 1 if not paddle.any(unfinished_flag): break return input_ids[:, origin_len:], scores def forward(self, input_ids=None, **model_kwargs): max_length = self.max_length min_length = self.min_length decode_strategy = self.decode_strategy temperature = self.temperature top_k = self.top_k top_p = self.top_p repetition_penalty = self.repetition_penalty num_beams = self.num_beams num_beam_groups = self.num_beam_groups length_penalty = self.length_penalty early_stopping = self.early_stopping bos_token_id = self.bos_token_id eos_token_id = self.eos_token_id pad_token_id = self.pad_token_id decoder_start_token_id = self.decoder_start_token_id forced_bos_token_id = self.forced_bos_token_id forced_eos_token_id = self.forced_eos_token_id num_return_sequences = self.num_return_sequences diversity_rate = self.diversity_rate use_cache = self.use_cache assert ( decode_strategy in ["greedy_search", "sampling", "beam_search"] ), "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( decode_strategy) bos_token_id = bos_token_id if bos_token_id is not None else getattr( self.gpt, 'bos_token_id', None) eos_token_id = eos_token_id if eos_token_id is not None else getattr( self.gpt, 'eos_token_id', None) pad_token_id = pad_token_id if pad_token_id is not None else getattr( self.gpt, 'pad_token_id', None) forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr( self.gpt, 'forced_bos_token_id', None) forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr( self.gpt, 'forced_eos_token_id', None) decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr( self.gpt, 'decoder_start_token_id', None) # params check if input_ids is None: # Init `input_ids` with bos_token_id input_ids = self.prepare_input_ids_for_generation(bos_token_id) if model_kwargs.get("attention_mask", None) is None: # TODO # Init `attention_mask` depending on `pad_token_id` model_kwargs[ "attention_mask"] = self.prepare_attention_mask_for_generation( input_ids, pad_token_id, eos_token_id) self.is_encoder_decoder = False model_kwargs["use_cache"] = use_cache max_length += input_ids.shape[-1] min_length += input_ids.shape[-1] logits_processors = self.get_logits_processor( min_length=min_length, max_length=max_length, eos_token_id=eos_token_id, forced_bos_token_id=forced_bos_token_id, forced_eos_token_id=forced_eos_token_id, num_beams=num_beams, num_beam_groups=num_beam_groups, diversity_rate=diversity_rate, repetition_penalty=repetition_penalty) if decode_strategy == 'sampling': if num_return_sequences > 1: input_ids, model_kwargs = self.expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, **model_kwargs) ret = self.sample(input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k, top_p, temperature, **model_kwargs) else: raise ValueError(f'Not support {decoding_strategy} strategy yet!') return ret def get_triangle_upper_mask(x, 
mask): if mask is not None: return mask mask = paddle.full_like(x, -np.inf) mask.stop_gradient = True mask = paddle.triu(mask, diagonal=1) mask.stop_gradient = True return mask class ConcatSoftmaxInput(PyLayer): @staticmethod def forward(ctx, inp, group=None): inputs = [] paddle.distributed.all_gather(inputs, inp, group=group) with paddle.no_grad(): cat = paddle.concat(inputs, axis=-1) ctx.cat_args = group return cat @staticmethod def backward(ctx, grad): group = ctx.cat_args with paddle.no_grad(): grads = paddle.split( grad, paddle.distributed.get_world_size(group), axis=-1) grad = grads[paddle.distributed.get_rank(group)] return grad ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/processor.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import List import inspect from abc import ABC import paddle class LogitsProcessorList(List): def __call__(self, input_ids, logits, **kwargs): for processor in self: processor_args = inspect.signature(processor.__call__).parameters if len(processor_args) > 2: assert all( arg in kwargs for arg in list(processor_args.keys())[2:] ), f"The parameters don't match for {processor.__class__}" logits = processor(input_ids, logits, **kwargs) else: logits = processor(input_ids, logits) return logits class LogitsProcessor(ABC): """ Abstract base class for all logit processors that can be applied during generation. """ def __call__(self, input_ids, logits): raise NotImplementedError( f"{self.__class__} is an abstract class. " "Only classes inheriting this class can be called.") class MinLengthLogitsProcessor(LogitsProcessor): r""" Enforcing a min-length by setting EOS probability to 0. Args: min_length (int): The minimum length of generation sequence. eos_token_id (int): The id of the `end-of-sequence` token. """ def __init__(self, min_length, eos_token_id): if not isinstance(min_length, int) or min_length < 0: raise ValueError( "`min_length` should be a positive integer, but get {}".format( min_length)) if not isinstance(eos_token_id, int) or eos_token_id < 0: raise ValueError( "`eos_token_id` should be a positive integer, but get {}". format(eos_token_id)) self.min_length = min_length self.eos_token_id = eos_token_id def __call__(self, input_ids, logits): cur_len = input_ids.shape[-1] if cur_len < self.min_length: logits[:, self.eos_token_id] = -float("inf") return logits class RepetitionPenaltyLogitsProcessor(LogitsProcessor): r""" Enforcing an exponential penalty on repeated sequences. Args: repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. See `this paper `__ for more details. 
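        Example (illustrative arithmetic only; the ``penalty`` value is made up)::

            # In __call__, logits of tokens that already appear in input_ids are
            # down-weighted: positive scores are divided by the penalty and
            # negative scores are multiplied by it.
            penalty = 1.2
            2.4 / penalty    # -> 2.0,  a likely repeated token becomes less likely
            -3.0 * penalty   # -> -3.6, an unlikely one becomes even less likely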
""" def __init__(self, penalty: float): if not isinstance(penalty, float) or not (penalty > 0): raise ValueError( f"`penalty` has to be a strictly positive float, but is {penalty}" ) self.penalty = penalty def __call__(self, input_ids, logits): score = paddle.index_sample(logits, input_ids) score = paddle.where(score < 0, score * self.penalty, score / self.penalty) input_ids = input_ids + paddle.arange(logits.shape[0]).unsqueeze( -1) * logits.shape[-1] outputs = paddle.scatter(logits.flatten(), input_ids.flatten(), score.flatten()).reshape(logits.shape) return outputs class HammingDiversityLogitsProcessor(LogitsProcessor): """ This `LogitsProcessor` enforces diverse beam search. Note that this logits processor is only effective for `group_beam_search`. See `this paper `__ for more details. Args: diversity_rate (float): This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. num_beams (int): Number of beams used for group beam search. num_beam_groups (int): Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. """ def __init__(self, diversity_rate, num_beams, num_beam_groups): if not isinstance(diversity_rate, float) or (not diversity_rate > 0.0): raise ValueError( "`diversity_rate` should be a float strictly larger than 0.") self._diversity_rate = diversity_rate if not isinstance(num_beams, int) or num_beams < 2: raise ValueError( "`num_beams` should be an integer strictly larger than 1.") self._num_beams = num_beams if not isinstance(num_beam_groups, int) or num_beam_groups < 2: raise ValueError( "`num_beam_groups` should be an integer strictly larger than 1." ) self._num_sub_beams = num_beams // num_beam_groups def __call__(self, input_ids, scores, current_tokens, beam_group_idx): batch_size = current_tokens.shape[0] // self._num_beams group_start_idx = beam_group_idx * self._num_sub_beams group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) group_size = group_end_idx - group_start_idx vocab_size = scores.shape[-1] if group_start_idx == 0: return scores for batch_idx in range(batch_size): previous_group_tokens = current_tokens[ batch_idx * self._num_beams:batch_idx * self._num_beams + group_start_idx] token_frequency = paddle.bincount( previous_group_tokens, minlength=vocab_size) scores[batch_idx * group_size:(batch_idx + 1) * group_size] -= self._diversity_rate * token_frequency return scores class ForcedBOSTokenLogitsProcessor(LogitsProcessor): """ This `LogitsProcessor` enforces the first generated token to be the selected `forced_bos_token`. Args: forced_bos_token_id (:obj:`int`): The id of the token to to be generated as the first token. """ def __init__(self, forced_bos_token_id): self.forced_bos_token_id = forced_bos_token_id def __call__(self, input_ids, scores): cur_len = input_ids.shape[-1] if cur_len == 1: num_tokens = scores.shape[1] scores[:, [ i for i in range(num_tokens) if i != self.forced_bos_token_id ]] = -float("inf") scores[:, self.forced_bos_token_id] = 0 return scores class ForcedEOSTokenLogitsProcessor(LogitsProcessor): """ This `LogitsProcessor` enforces the last generated token to be the selected `forced_eos_token`. Args: max_length (int): The maximum length of the sequence to be generated. forced_eos_token_id (int): The id of the token to to be generated as the last token. 
""" def __init__(self, max_length, forced_eos_token_id): self.max_length = max_length self.forced_eos_token_id = forced_eos_token_id def __call__(self, input_ids, scores): cur_len = input_ids.shape[-1] if cur_len == self.max_length - 1: num_tokens = scores.shape[1] scores[:, [ i for i in range(num_tokens) if i != self.forced_eos_token_id ]] = -1e9 #TODO change back to -inf after paddle.topk is fixed scores[:, self.forced_eos_token_id] = 0 return scores ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle from paddle import framework from paddle import distributed as dist from paddle.nn import functional as F from paddle.autograd import PyLayer from paddle.fluid import core from paddle.nn.layer.layers import Layer from paddle.distributed import fleet from paddle.distributed.fleet.base import topology as tp from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients_with_group from ppfleetx.distributed.apis import env import numpy as np #################################################### # # # Distributed Communication Operator # # # #################################################### def scatter(input): hcg = env.get_hcg() group = hcg.get_model_parallel_group() parallelism = group.nranks rank = group.rank seq_len = input.shape[0] assert seq_len % parallelism == 0, "Input sequence length {} can't be divided exactly by sequence parallelism {}".format( seq_len, parallelism) interval = seq_len // parallelism input = paddle.slice( input, axes=[0], starts=[interval * rank], ends=[interval * (rank + 1)]) return input def all_gather(input): hcg = env.get_hcg() group = hcg.get_model_parallel_group() parallelism = group.nranks output_shape = input.shape output_shape[0] = output_shape[0] * parallelism output = paddle.empty(shape=output_shape, dtype=input.dtype) group.process_group.all_gather(input, output).wait() return output def reduce_scatter(input): hcg = env.get_hcg() group = hcg.get_model_parallel_group() parallelism = group.nranks output_shape = input.shape assert input.shape[ 0] % parallelism == 0, "Input sequence length {0} can't be divided exactly by sequence parallelism {1}".format( input.shape[0], parallelism) output_shape[0] = output_shape[0] // parallelism output = paddle.empty(shape=output_shape, dtype=input.dtype) dist.stream.reduce_scatter( output, input, op=dist.ReduceOp.SUM, group=group, sync_op=True) return output class ScatterOp(PyLayer): # input shape: [s, b, h], n is mp parallelism # after forward shape: [s/n, b, h] @staticmethod def forward(ctx, input): return scatter(input) @staticmethod def 
backward(ctx, grad): return all_gather(grad) class GatherOp(PyLayer): # input shape: [s/n, b, h], n is mp parallelism # after forward shape: [s, b, h] @staticmethod def forward(ctx, input): return all_gather(input) @staticmethod def backward(ctx, grad): return scatter(grad) # All gather along the first dim during forward pass # All reduce and scatter along the first dim during backward pass class AllGatherOp(PyLayer): # input shape: [s/n, b, h], n is mp parallelism # after forward shape: [s, b, h] @staticmethod def forward(ctx, input): return all_gather(input) # grad shape: [s, b, h], n is mp parallelism # after forward shape: [s/n, b, h] @staticmethod def backward(ctx, grad): return reduce_scatter(grad) # All reduce and scatter along the first dim during forward pass # All gather along the first dim during backward pass class ReduceScatterOp(PyLayer): # input shape: [s, b, h], n is mp parallelism # after forward shape: [s/n, b, h] @staticmethod def forward(ctx, input): return reduce_scatter(input) # grad shape: [s/n, b, h], n is mp parallelism # after forward shape: [s, b, h] @staticmethod def backward(ctx, grad): return all_gather(grad) ################################################### # # # Modified Parallel Linear Operator # # # ################################################### def mark_as_sequence_parallel_parameter(parameter): setattr(parameter, 'sequence_parallel', True) def is_sequence_parallel_parameter(parameter): return getattr(parameter, 'sequence_parallel', False) def create_fused_allreduce_gradient_hook(parameter_list, accumulation_steps): hcg = env.get_hcg() group = hcg.get_model_parallel_group() step = [0] accumulation_steps *= len(parameter_list) def __impl__(grad): step[0] += 1 if step[0] == accumulation_steps: step[0] = 0 fused_allreduce_gradients_with_group( parameter_list, group=group, scale=1.0) return grad return __impl__ def create_non_fused_allreduce_gradient_hook(param, accumulation_steps): hcg = env.get_hcg() pg = hcg.get_model_parallel_group().process_group step = [0] @paddle.autograd.no_grad() def __impl__(): step[0] += 1 if (step[0] % accumulation_steps) == 0: if hasattr(param, "main_grad"): pg.allreduce(param.main_grad).wait() else: pg.allreduce(param.grad).wait() return __impl__ def register_sequence_parallel_allreduce_hooks( model, accumulation_steps, fuse_sequence_parallel_allreduce): if accumulation_steps <= 0 or not paddle.distributed.is_initialized(): return mp_group = env.get_hcg().get_model_parallel_group() if mp_group.nranks <= 1: return params = [] for p in model.parameters(): if is_sequence_parallel_parameter(p): params.append(p) if fuse_sequence_parallel_allreduce: hook = create_fused_allreduce_gradient_hook(params, accumulation_steps) for p in params: p._register_backward_hook(hook) else: for p in params: hook = create_non_fused_allreduce_gradient_hook(p, accumulation_steps) p._register_backward_hook(hook) def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False class ColumnSequenceParallelLinear(Layer): def __init__(self, in_features, out_features, weight_attr=None, has_bias=None, gather_output=True, fuse_matmul_bias=False, mp_group=None, name=None): super(ColumnSequenceParallelLinear, self).__init__() hcg = env.get_hcg() self.model_parallel_group = hcg.get_model_parallel_group( ) if mp_group is None else mp_group self.world_size = hcg.get_model_parallel_group( ).nranks if mp_group is None else 
mp_group.nranks self._name = name self.is_mp = (self.world_size > 1) assert gather_output is False, "If sequence_parallel is True, \ gather_output is False" self.gather_output = gather_output assert out_features % self.world_size == 0, ( "Number of column of the weight for linear ({}) must be" " divisible by model parallel size ({})".format(out_features, self.world_size)) self.output_size_per_partition = out_features // self.world_size self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() if self.is_mp and paddle.in_dynamic_mode(): with get_rng_state_tracker().rng_state(): self.weight = self.create_parameter( shape=[in_features, self.output_size_per_partition], attr=self._weight_attr, dtype=self._dtype, is_bias=False) else: self.weight = self.create_parameter( shape=[in_features, self.output_size_per_partition], attr=self._weight_attr, dtype=self._dtype, is_bias=False) self.weight.is_distributed = True if self.is_mp else False if has_bias: # initialize bias to zero like Megatron self.bias = self.create_parameter( shape=[self.output_size_per_partition], attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) self.bias.is_distributed = True if self.is_mp else False else: self.bias = None self.linear = F.linear if fuse_matmul_bias: if not is_fused_matmul_bias_supported(): raise NotImplementedError( "You set fuse_matmul_bias=True in ColumnSequenceParallelLinear, " "however, the paddle you are using not support this operation. " "Please set fuse_matmul_bias=False or use paddle compiled " "with cuda 11.6 or higher.") from paddle.incubate.nn.functional import fused_linear self.linear = fused_linear def forward(self, x): # sequence parallelism is same as model parallelism # if sequence parallel is true, input shape is [s, b, h] # else input shape is [b, s, h] if self.is_mp: input_parallel = AllGatherOp.apply(x) else: input_parallel = x output = self.linear( input_parallel, self.weight, self.bias, name=self._name) return output class RowSequenceParallelLinear(Layer): def __init__(self, in_features, out_features, weight_attr=None, has_bias=True, input_is_parallel=False, fuse_matmul_bias=False, mp_group=None, name=None): super(RowSequenceParallelLinear, self).__init__() self.in_features = in_features self.out_features = out_features assert input_is_parallel is True, "If sequence_parallel is True, \ input_is_parallel should be true." 
self.input_is_parallel = input_is_parallel self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() self._name = name hcg = env.get_hcg() self.model_parallel_group = hcg.get_model_parallel_group( ) if mp_group is None else mp_group self.world_size = hcg.get_model_parallel_group( ).nranks if mp_group is None else mp_group.nranks self.rank = hcg.get_model_parallel_group( ).rank if mp_group is None else mp_group.rank self.is_mp = (self.world_size > 1) assert in_features % self.world_size == 0, ( "Number of row of the weight for linear ({}) must be" " divisible by model parallel size ({})".format(in_features, self.world_size)) self.input_size_per_partition = in_features // self.world_size if self.is_mp and paddle.in_dynamic_mode(): with get_rng_state_tracker().rng_state(): self.weight = self.create_parameter( shape=[self.input_size_per_partition, self.out_features], attr=self._weight_attr, dtype=self._dtype, is_bias=False) else: self.weight = self.create_parameter( shape=[self.input_size_per_partition, self.out_features], attr=self._weight_attr, dtype=self._dtype, is_bias=False) self.weight.is_distributed = True if self.is_mp else False # if sequence parallel is true, # register hook to all_reduce gradient of weight and bias if has_bias: self.bias = self.create_parameter( shape=[self.out_features], attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) if self.is_mp: mark_as_sequence_parallel_parameter(self.bias) else: self.bias = None self.linear = F.linear if fuse_matmul_bias: if not is_fused_matmul_bias_supported(): raise NotImplementedError( "You set fuse_matmul_bias=True in RowParallelLinear, " "however, the paddle you are using not support this operation. " "Please set fuse_matmul_bias=False or use paddle compiled " "with cuda 11.6 or higher.") from paddle.incubate.nn.functional import fused_linear self.linear = fused_linear def forward(self, x): input_parallel = x if self.is_mp: output_parallel = self.linear( input_parallel, self.weight, name=self._name) output_ = ReduceScatterOp.apply(output_parallel) # if self.bias is not none, sequence parallel will use # register_hook to all_reduce self.bias output = output_ + self.bias if self.bias is not None else output_ else: output = self.linear( input_parallel, self.weight, self.bias, name=self._name) return output ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/single_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import collections import logging from distutils.util import strtobool import os import numpy as np import math import paddle import paddle.nn as nn import paddle.nn.functional as F import paddle.tensor as tensor from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list from paddle.common_ops_import import convert_dtype import paddle.incubate as incubate from paddle.distributed.fleet.utils import recompute from paddle.incubate.nn import FusedLinear from .processor import ( LogitsProcessorList, MinLengthLogitsProcessor, HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor) from ppfleetx.models.language_model.moe import MoELayer from ppfleetx.models.language_model.moe_exp.layer import MoE from ppfleetx.utils.log import logger try: from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None def get_attr(layer, name): if getattr(layer, name, None) is not None: return getattr(layer, name, None) else: return get_attr(layer._layer, name) class ExpertLayer(nn.Layer): def __init__(self, d_model, d_hidden, name=None): super(ExpertLayer, self).__init__() self.htoh4 = nn.Linear( d_model, d_hidden, weight_attr=nn.initializer.KaimingUniform(), bias_attr=nn.initializer.Constant(value=0.0)) self.h4toh = nn.Linear( d_hidden, d_model, weight_attr=nn.initializer.KaimingUniform(), bias_attr=nn.initializer.Constant(value=0.0)) self.htoh4.weight.name = "expert_" + self.htoh4.weight.name self.h4toh.weight.name = "expert_" + self.h4toh.weight.name self.htoh4.bias.name = "expert_" + self.htoh4.bias.name self.h4toh.bias.name = "expert_" + self.h4toh.bias.name def forward(self, x): x = self.htoh4(x) x = F.gelu(x, approximate=True) x = self.h4toh(x) return x class MultiHeadAttention(nn.Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. 
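        Shape walkthrough (illustrative, matching ``_prepare_qkv``/``core_attn``
        below, with ``embed_dim=768`` and ``num_heads=12`` so ``head_dim=64``)::

            query                                 [batch, seq_len, 768]
            q/k/v after projection + reshape      [batch, seq_len, 12, 64]
            after transpose for attention         [batch, 12, seq_len, 64]
            output with heads merged back         [batch, seq_len, 768]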
""" Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, output_layer_weight_attr=None, fuse_attn_qkv=False, scale_qk_coeff=1.0, fused_linear=False, use_recompute=False, recompute_granularity="full", do_recompute=True, use_flash_attn=False): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.fuse_attn_qkv = fuse_attn_qkv self.scale_qk_coeff = scale_qk_coeff self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.do_recompute = do_recompute self.use_flash_attn = use_flash_attn if flash_attention else None self.head_dim = embed_dim // num_heads assert self.head_dim * \ num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" Linear = FusedLinear if fused_linear else nn.Linear if self.fuse_attn_qkv: assert self.kdim == embed_dim assert self.vdim == embed_dim self.qkv_proj = Linear( embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr) else: self.q_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( embed_dim, embed_dim, output_layer_weight_attr, bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) assert not isinstance( cache, self.StaticCache ), "cache currently does not support the StaticCache type" if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. """ q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. 
""" k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def _flash_attention(self, q, k, v, attn_mask=None): out, weights = flash_attention( q, k, v, self.dropout, causal=True, return_softmax=self.need_weights) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) return out, weights def core_attn(self, q, k, v, attn_mask=None): perm = [0, 2, 1, 3] q = tensor.transpose(x=q, perm=perm) k = tensor.transpose(x=k, perm=perm) v = tensor.transpose(x=v, perm=perm) # scale dot product attention scale_qk_coeff = self.scale_qk_coeff * self.head_dim**0.5 product = paddle.matmul( x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) if self.scale_qk_coeff != 1.0: product = product.scale(self.scale_qk_coeff) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) else: weights = incubate.softmax_mask_fuse_upper_triangle(product) if self.dropout: weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, -1]) return out, weights def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if self.fuse_attn_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_recompute and self.recompute_granularity == "core_attn" and self.do_recompute: out, weights = recompute(self.core_attn, q, k, v, attn_mask) elif self.use_flash_attn and attn_mask is None: out, weights = self._flash_attention(q, k, v) else: out, weights = self.core_attn(q, k, v, attn_mask=attn_mask) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerDecoder(nn.Layer): """ TransformerDecoder is a stack of N decoder layers. 
""" def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None, use_recompute=False, recompute_granularity="full", no_recompute_layers=None): super(TransformerDecoder, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.no_recompute_layers = no_recompute_layers self.num_layers = num_layers self.layers = decoder_layers self.norm = norm self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) elif norm is not None: raise ValueError("Only support LayerNorm") def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False, cache=None): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. """ output = tgt new_caches = [] for i, mod in enumerate(self.layers): if cache is None: if use_cache: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache) new_caches.append(new_cache) else: if self.use_recompute and self.recompute_granularity == "full" and i not in self.no_recompute_layers: output = recompute(mod, output, memory, tgt_mask, use_cache, cache) else: output = mod(output, memory, tgt_mask, use_cache, cache) else: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` for more details. If `do_zip` is True, apply `zip` on these tuples to get a list with two elements. """ cache = [layer.gen_cache(memory) for layer in self.layers] if do_zip: cache = list(zip(*cache)) return cache class TransformerDecoderLayer(nn.Layer): """ The transformer decoder layer. It contains multiheadattention and some linear layers. 
""" def __init__(self, d_model, nhead, dim_feedforward, num_experts=1, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, topk=1, moe_use_residual=False, moe_train_capacity_factor=1.0, moe_eval_capacity_factor=1.0, moe_min_capacity=4, moe_token_dropping=True, enable_expert_tensor_parallelism=False, weight_attr=None, bias_attr=None, output_layer_weight_attr=None, fused_linear=False, fuse_attn_qkv=False, scale_qk_coeff=1.0, use_recompute=False, recompute_granularity="full", do_recompute=True, skip_quant_tensors=[], use_flash_attn=False): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.do_recompute = do_recompute self.num_experts = num_experts weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) output_layer_weight_attrs = _convert_param_attr_to_list( output_layer_weight_attr, 3) Linear = FusedLinear if fused_linear else nn.Linear self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], output_layer_weight_attr=output_layer_weight_attrs[0], fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=scale_qk_coeff, use_recompute=use_recompute, recompute_granularity=recompute_granularity, do_recompute=do_recompute, use_flash_attn=use_flash_attn) self.moe_mlp = None if self.num_experts > 1: assert (topk == 1, "Only support topk=1 currently.") self.moe_mlp = MoE( d_model, ExpertLayer(d_model, dim_feedforward), self.num_experts, ep_size=1, k=topk, use_residual=moe_use_residual, capacity_factor=moe_train_capacity_factor, eval_capacity_factor=moe_eval_capacity_factor, min_capacity=moe_min_capacity, drop_tokens=moe_token_dropping, enable_expert_tensor_parallelism=enable_expert_tensor_parallelism ) else: self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = Linear( dim_feedforward, d_model, output_layer_weight_attrs[2], bias_attr=bias_attrs[2]) if 'linear1' in skip_quant_tensors: self.linear1.skip_quant = True if 'linear2' in skip_quant_tensors: self.linear2.skip_quant = True self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") if activation == 'gelu': self.activation = nn.GELU(approximate=True) else: self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn" and self.do_recompute: tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache) else: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) # if self.expert_mode: # tgt = self.moe_mlp(tgt) if self.num_experts > 1: tgt = 
self.moe_mlp(tgt) else: tgt = self.dropout2( self.linear2(self.activation(self.linear1(tgt)))) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): incremental_cache = self.self_attn.gen_cache( memory, type=self.self_attn.Cache) return incremental_cache class GPTEmbeddings(nn.Layer): """ Include embeddings from word and position embeddings. """ def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, freeze_embedding=False): super(GPTEmbeddings, self).__init__() self.word_embeddings = nn.Embedding( vocab_size, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) if freeze_embedding: self.word_embeddings.weight.learning_rate = 0.0 self.position_embeddings.weight.learning_rate = 0.0 self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): if position_ids is None: ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings embeddings = self.dropout(embeddings) return embeddings class GPTModel(nn.Layer): def __init__(self, vocab_size=51200, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, use_recompute=False, initializer_range=0.02, num_experts=[1], expert_interval=2, topk=1, moe_use_residual=False, moe_train_capacity_factor=1.0, moe_eval_capacity_factor=1.0, moe_min_capacity=4, moe_token_dropping=True, enable_expert_tensor_parallelism=False, fused_linear=False, fuse_attn_qkv=False, scale_qk_by_layer_num=True, recompute_granularity="full", sequence_parallel=False, no_recompute_layers=None, skip_tensor_map={}, freeze_embedding=False, use_flash_attn=False, fused_softmax_with_triangular=False): super(GPTModel, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.fused_softmax_with_triangular = fused_softmax_with_triangular if use_flash_attn: if flash_attention: logger.info("Flash-attention enabled.") else: use_flash_attn = False logger.warning( "Flash-attention is not support in this Paddle version.") self.embeddings = GPTEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, self.initializer_range, freeze_embedding) assert len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval, \ 'num_experts must be either a single value or a list of the same length as the number of MoE layers' # Expand the list of MoE experts num to MoE layers num if len(num_experts) == 1: num_experts = num_experts * (num_layers // expert_interval) decoder_layers = nn.LayerList() for i in range(num_layers): # TODO: original layer_num = i + 1 + offset here layer_num = i + 1 if layer_num % expert_interval == 0: n_e = num_experts[(layer_num - 1) // expert_interval] else: n_e = 1 decoder_layers.append( 
TransformerDecoderLayer( d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, num_experts=n_e, dropout=hidden_dropout_prob, activation="gelu", attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, topk=topk, moe_use_residual=moe_use_residual, moe_train_capacity_factor=moe_train_capacity_factor, moe_eval_capacity_factor=moe_eval_capacity_factor, moe_min_capacity=moe_min_capacity, moe_token_dropping=moe_token_dropping, enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), output_layer_weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range / math.sqrt( 2.0 * num_layers))), bias_attr=None, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=num_layers if scale_qk_by_layer_num else 1.0, use_recompute=use_recompute, recompute_granularity=recompute_granularity, do_recompute=i not in no_recompute_layers, skip_quant_tensors=skip_tensor_map.get('block_{}'.format( i), []), use_flash_attn=use_flash_attn)) self.decoder = TransformerDecoder( decoder_layers, num_layers, norm="LayerNorm", hidden_size=hidden_size, use_recompute=use_recompute, recompute_granularity=recompute_granularity, no_recompute_layers=no_recompute_layers) def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(attention_mask)[-1] - 1 position_ids = paddle.arange( past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: if len(attention_mask.shape) == 2: attention_mask = attention_mask[:, None, None, :] attention_mask = attention_mask + causal_mask else: attention_mask = causal_mask # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True encoder_outputs = self.decoder( embedding_output, memory=None, tgt_mask=None if (self.fused_softmax_with_triangular and self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) return encoder_outputs class GPTForPretraining(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
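
    Example (an illustrative sketch with toy sizes; the import path below is
    assumed from the package layout):

        .. code-block::

            import paddle
            from ppfleetx.models.language_model.gpt import (
                GPTModel, GPTForPretraining, GPTPretrainingCriterion)

            # Deliberately tiny model so the snippet runs quickly.
            gpt = GPTModel(
                vocab_size=1000,
                hidden_size=64,
                num_layers=2,
                num_attention_heads=4,
                ffn_hidden_size=256,
                max_position_embeddings=128)
            model = GPTForPretraining(gpt)
            criterion = GPTPretrainingCriterion()

            tokens = paddle.randint(0, 1000, shape=[2, 16], dtype='int64')
            labels = paddle.randint(0, 1000, shape=[2, 16], dtype='int64')
            loss_mask = paddle.ones([2, 16], dtype='float32')

            logits = model(tokens)                       # [2, 16, 1000]
            loss = criterion(logits, labels, loss_mask)  # scalar loss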
""" def __init__(self, gpt): super(GPTForPretraining, self).__init__() self.gpt = gpt def forward(self, input_ids, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None): outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) if use_cache: encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs logits = paddle.matmul( encoder_outputs, get_attr(self.gpt.embeddings.word_embeddings, "weight"), transpose_y=True) if use_cache: return logits, cached_kvs else: return logits class GPTPretrainingCriterion(nn.Layer): """ Criterion for GPT. It calculates the final loss. """ def __init__(self, topo=None): super(GPTPretrainingCriterion, self).__init__() self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") def forward(self, prediction_scores, masked_lm_labels, loss_mask): """ Args: prediction_scores(Tensor): The logits of masked token prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. masked_lm_labels(Tensor): The labels of the masked language modeling, the dimensionality of `masked_lm_labels` is equal to `prediction_scores`. Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. loss_mask(Tensor): Mask used for calculating the loss of the masked language modeling to avoid calculating some unwanted tokens. Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() return loss class GPTForSequenceClassification(nn.Layer): """ GPT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. Args: gpt (:class:`GPTModel`): An instance of GPTModel. num_classes (int, optional): The number of classes. Defaults to `2`. """ def __init__(self, gpt, num_classes=2): super(GPTForSequenceClassification, self).__init__() self.gpt = gpt self.score = nn.Linear( self.gpt.hidden_size, num_classes, bias_attr=False) from paddle.nn.initializer import Normal normal_ = Normal(std=self.gpt.initializer_range) normal_(self.score.weight) def forward(self, input_ids, position_ids=None, attention_mask=None): output = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask) logits = self.score(output) # padding index maybe 0 eos_token_id = 0 # sequence_lengths shape [bs,] sequence_lengths = (input_ids != eos_token_id).astype("int64").sum( axis=-1) - 1 pooled_logits = logits.gather_nd( paddle.stack( [paddle.arange(output.shape[0]), sequence_lengths], axis=-1)) return pooled_logits class GPTForGeneration(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
""" def __init__(self, gpt, configs): super(GPTForGeneration, self).__init__() self.gpt = gpt self.configs = configs self.max_length = self.configs.get('max_dec_len', 20) self.min_length = self.configs.get('min_dec_len', 0) self.decode_strategy = self.configs.get('decode_strategy', 'sampling') self.temperature = self.configs.get('temperature', 1.0) self.top_k = self.configs.get('top_k', 0) self.top_p = self.configs.get('top_p', 1.0) self.use_topp_sampling = self.configs.get('use_topp_sampling', False) self.inference = self.configs.get('inference', False) self.repetition_penalty = self.configs.get('repetition_penalty', 1.0) self.num_beams = self.configs.get('num_beams', 1) self.num_beam_groups = self.configs.get('num_beam_groups', 1) self.length_penalty = self.configs.get('length_penalty', 0.0) self.early_stopping = self.configs.get('early_stopping', False) self.bos_token_id = self.configs.get('bos_token_id', None) self.eos_token_id = self.configs.get('eos_token_id', None) self.pad_token_id = self.configs.get('pad_token_id', None) self.decoder_start_token_id = self.configs.get( 'decoder_start_token_id', None) self.forced_bos_token_id = self.configs.get('forced_bos_token_id', None) self.forced_eos_token_id = self.configs.get('forced_eos_token_id', None) self.num_return_sequences = self.configs.get('num_return_sequences', 1) self.diversity_rate = self.configs.get('diversity_rate', 0.0) self.use_cache = self.configs.get('use_cache', True) def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): batch_size = 1 if bos_token_id is None: raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") if encoder_output is not None: batch_size = encoder_output.shape[0] return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( input_ids == pad_token_id).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( (eos_token_id is not None) and (pad_token_id != eos_token_id)) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids == pad_token_id ).astype(paddle.get_default_dtype()) * -1e9 else: attention_mask = paddle.zeros_like( input_ids, dtype=paddle.get_default_dtype()) return paddle.unsqueeze(attention_mask, axis=[1, 2]) def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): # update scores unfinished_scores = (scores * length + next_scores) / (length + 1) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores def get_logits_processor(self, min_length=None, max_length=None, eos_token_id=None, forced_bos_token_id=None, forced_eos_token_id=None, num_beams=1, num_beam_groups=1, diversity_rate=0.0, repetition_penalty=None): processors = LogitsProcessorList() if min_length is not None and eos_token_id is not None and min_length > -1: processors.append( MinLengthLogitsProcessor(min_length, eos_token_id)) if num_beam_groups > 1 and diversity_rate > 0.0: processors.append( HammingDiversityLogitsProcessor( diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups)) if repetition_penalty is not None and repetition_penalty != 1.0: processors.append( RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) if forced_bos_token_id is not None: processors.append( ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) if forced_eos_token_id is not 
None: processors.append( ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) # TODO # Add more pre_processing for distribution return processors def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): index = paddle.tile( paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) if attention_mask is not None: model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.gather(position_ids, index) if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: seq_len = model_kwargs["seq_len"] model_kwargs["seq_len"] = paddle.gather(seq_len, index) if "encoder_output" in model_kwargs and model_kwargs[ "encoder_output"] is not None: encoder_output = model_kwargs["encoder_output"] model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.gather(role_ids, index) return input_ids, model_kwargs def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): # only last token for inputs_ids if cache is defined in kwargs position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: if len(attention_mask.shape) == 4: attention_mask = attention_mask[:, -1, -1, :] if "int" in paddle.common_ops_import.convert_dtype( attention_mask.dtype): attention_mask = (1.0 - attention_mask) * -1e4 return { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "cache": cache } def update_model_kwargs_for_generation(self, next_tokens, outputs, model_kwargs, is_encoder_decoder=False): # Update the model inputs during generation. # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` # and they contain pad value, the result vectors updated by this method # may be different from expected. In this case, you need to rewrite the # method. 
# update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1:]], axis=-1) # update position_ids if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = position_ids[:, -1:] + 1 # update attention_mask if not is_encoder_decoder and "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') if len(attention_mask.shape) == 4: attention_mask = nn.Pad2D( [0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D( [0, 1, 0, 0], value=-1e4)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') else: attention_mask = paddle.concat( [ attention_mask, paddle.ones( [attention_mask.shape[0], 1], dtype="int64") ], axis=-1) model_kwargs["attention_mask"] = attention_mask # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( [role_ids, role_ids[:, -1:]], axis=-1) model_kwargs['res'] = paddle.concat( [model_kwargs['res'], next_tokens], axis=1) return model_kwargs def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast( sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + paddle.arange(probs.shape[ 0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype='int64') origin_len = 
input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype='int64') unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full( [batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) res = paddle.assign(input_ids) model_kwargs['res'] = res # use_cache is immutable, we split it off other mutable kwargs. assert 'use_cache' in model_kwargs immutable = {'use_cache': model_kwargs['use_cache']} del model_kwargs['use_cache'] def _forward_(**args): model_inputs = self.prepare_inputs_for_generation( input_ids, **args, **immutable) return self.gpt(**model_inputs, **immutable) def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): logits = outputs[0] if isinstance(outputs, tuple) else outputs logits = paddle.matmul( logits, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) if temperature is None or temperature == 1.0: probs = paddle.assign(origin_probs) origin_probs = paddle.log(origin_probs) else: origin_probs = paddle.log(origin_probs) logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: if self.use_topp_sampling: try: from ppfleetx_ops import topp_sampling except ImportError: raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) top_ps_tensor = paddle.full( shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) _, next_tokens = topp_sampling( probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) if not self.use_topp_sampling: next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation( scores, next_scores, cur_len - origin_len, unfinished_flag) input_ids = next_tokens if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) model_kwargs = self.update_model_kwargs_for_generation( next_tokens, outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids, scores, unfinished_flag, model_kwargs # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement # the value in model_kwargs should be tensor before while loop outputs = _forward_(**model_kwargs) input_ids, scores, unfinished_flag, model_kwargs = _post_process_( outputs, input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) attn_mask = model_kwargs['attention_mask'] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
model_kwargs['attention_mask'] = paddle.reshape( attn_mask, paddle.shape(attn_mask)) model_kwargs['cache'] = outputs[1] if isinstance(outputs, tuple) else None while cur_len < max_length: # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) # and change it to pass directly to _post_process_ to avoid # closed-loop problem of dynamic-to-static model input_ids, scores, unfinished_flag, model_kwargs = _post_process_( _forward_(**model_kwargs), input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) if not paddle.any(unfinished_flag): break return model_kwargs['res'][:, origin_len:], scores def forward(self, input_ids=None, **model_kwargs): max_length = self.max_length min_length = self.min_length decode_strategy = self.decode_strategy temperature = self.temperature top_k = self.top_k top_p = self.top_p repetition_penalty = self.repetition_penalty num_beams = self.num_beams num_beam_groups = self.num_beam_groups length_penalty = self.length_penalty early_stopping = self.early_stopping bos_token_id = self.bos_token_id eos_token_id = self.eos_token_id pad_token_id = self.pad_token_id decoder_start_token_id = self.decoder_start_token_id forced_bos_token_id = self.forced_bos_token_id forced_eos_token_id = self.forced_eos_token_id num_return_sequences = self.num_return_sequences diversity_rate = self.diversity_rate use_cache = self.use_cache assert ( decode_strategy in ["greedy_search", "sampling", "beam_search"] ), "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( decode_strategy) bos_token_id = bos_token_id if bos_token_id is not None else getattr( self.gpt, 'bos_token_id', None) eos_token_id = eos_token_id if eos_token_id is not None else getattr( self.gpt, 'eos_token_id', None) pad_token_id = pad_token_id if pad_token_id is not None else getattr( self.gpt, 'pad_token_id', None) forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr( self.gpt, 'forced_bos_token_id', None) forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr( self.gpt, 'forced_eos_token_id', None) decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr( self.gpt, 'decoder_start_token_id', None) # params check if input_ids is None: # Init `input_ids` with bos_token_id input_ids = self.prepare_input_ids_for_generation(bos_token_id) if model_kwargs.get("attention_mask", None) is None: # TODO # Init `attention_mask` depending on `pad_token_id` model_kwargs[ "attention_mask"] = self.prepare_attention_mask_for_generation( input_ids, pad_token_id, eos_token_id) if model_kwargs.get("position_ids", None) is None: model_kwargs['position_ids'] = paddle.arange( 0, paddle.shape(model_kwargs['attention_mask'])[-1], dtype=input_ids.dtype).unsqueeze(0) self.is_encoder_decoder = False model_kwargs["use_cache"] = use_cache if self.inference: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static min_len = input_ids.shape[-1] max_len = input_ids.shape[-1] paddle.increment(min_len, min_length) paddle.increment(max_len, max_length) else: input_len = input_ids.shape[-1] max_len = max_length + input_len min_len = min_length + input_len logits_processors = self.get_logits_processor( min_length=min_len, max_length=max_len, eos_token_id=eos_token_id, 
forced_bos_token_id=forced_bos_token_id, forced_eos_token_id=forced_eos_token_id, num_beams=num_beams, num_beam_groups=num_beam_groups, diversity_rate=diversity_rate, repetition_penalty=repetition_penalty) if decode_strategy == 'sampling': if num_return_sequences > 1: input_ids, model_kwargs = self.expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, **model_kwargs) ret = self.sample(input_ids, logits_processors, max_len, pad_token_id, eos_token_id, top_k, top_p, temperature, **model_kwargs) else: raise ValueError(f'Not support {decode_strategy} strategy yet!') return ret ================================================ FILE: ppfleetx/models/language_model/language_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import copy import math import numpy as np import types import paddle from paddle.static import InputSpec import paddle.distributed.fleet as fleet from ppfleetx.core.module.basic_module import BasicModule import ppfleetx.models.language_model.gpt as gpt from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from .utils import process_configs from ppfleetx.data.tokenizers import GPTTokenizer from .metrics import * # TODO(haohongxiang): to solve the problem of cross-reference import paddlenlp from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "MoE": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def get_model_size(l, h, v, s): P = 0 # embedding P += (v + s) * h # attention P += (4 * h * h + 4 * h) * l # layer_norm of decoder P += (2 * (2 * h)) * l # FFN Layer P += (8 * h * h + 5 * h) * l # layer_norm of transformer P += 2 * h logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def vocab_size_with_padding(vocab_size, div_unit, mp_degree): padded_size = vocab_size multiple = div_unit * mp_degree while (padded_size % multiple) != 0: padded_size += 1 logging.warning(' > padded vocab (size: {}) with {} dummy tokens ' '(new size: {})'.format(vocab_size, padded_size - vocab_size, padded_size)) return padded_size class LanguageModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.data_world_size = env.get_data_world_size() super(LanguageModule, self).__init__(configs) self.loss_fn = self.get_loss_fn() def process_configs(self, configs): configs = process_configs(configs) return configs def forward(self, tokens, ids): return self.model(tokens, ids) def training_step(self, batch): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = self(tokens, position_ids) loss = self.loss_fn(preds, labels, loss_mask) return loss def 
training_step_end(self, log_dict): speed = 1. / log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len loss_scale_str = "loss_scale: %.9f," % ( log_dict['loss_scale']) if log_dict.get('loss_scale', None) is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e, found_inf: %.0f" % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_step'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, \ loss_scale_str, log_dict['lr'], log_dict['found_inf'])) def validation_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def validation_step_end(self, log_dict): speed = 1. / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['loss'], log_dict['eval_cost'], speed)) def test_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def test_step_end(self, log_dict): speed = 1. / log_dict['test_cost'] logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'], speed)) def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) class GPTModule(LanguageModule): def __init__(self, configs): super(GPTModule, self).__init__(configs) if configs.Model.sequence_parallel: register_sequence_parallel_allreduce_hooks( self, configs.Engine.accumulate_steps, configs.Distributed.fuse_sequence_parallel_allreduce) def get_model(self): model_setting = copy.deepcopy(self.configs.Model) if 'Compress' in self.configs and 'Quantization' in self.configs.Compress: quant_setting = copy.deepcopy(self.configs.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len get_model_size(l, h, v, s) if self.nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree if self.configs.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( 
gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) return model def get_loss_fn(self): if self.nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird( sequence_parallel=self.configs.Model.sequence_parallel) return loss_fn def pretreating_batch(self, batch): if self.configs.Distributed.pp_degree > 1: tokens, position_ids, labels, loss_mask = batch data = [(tokens, position_ids), (labels, loss_mask)] return data else: return batch def input_spec(self): return [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] def inference_end(self, outputs): for k, v in outputs.items(): for i in range(v.shape[0]): out_ids = [int(x) for x in v[i]] ret_str = self.tokenizer.decode(out_ids) # ret_str = text[i] + ret_str print(ret_str) class GPTFinetuneModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.data_world_size = env.get_data_world_size() super(GPTFinetuneModule, self).__init__(configs) # self.loss_config will be init in super class by get_model() assert self.loss_config is not None assert 'train' in self.loss_config assert 'eval' in self.loss_config train_loss = copy.deepcopy(self.loss_config.train) train_loss_cls = train_loss.pop('name') self.loss_fn = eval(f'paddle.nn.loss.{train_loss_cls}')(**train_loss) eval_loss = copy.deepcopy(self.loss_config.eval) eval_loss_cls = eval_loss.pop('name') self.eval_loss_fn = eval(f'paddle.nn.loss.{eval_loss_cls}')( **eval_loss) # self.metric_config will be init in super class by get_model() assert self.metric_config is not None assert 'eval' in self.metric_config if 'train' in self.metric_config: train_metric = copy.deepcopy(self.metric_config.train) train_metric_cls = train_metric.pop('name') self.train_metric = eval(f'{train_metric_cls}')(**train_metric) eval_metric = copy.deepcopy(self.metric_config.eval) eval_metric_cls = eval_metric.pop('name') self.eval_metric = eval(f'{eval_metric_cls}')(**eval_metric) self.best_metric = 0.0 def process_configs(self, configs): return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") self.metric_config = model_setting.pop("metric", None) self.loss_config = model_setting.pop("loss", None) pretrained = model_setting.pop("pretrained") num_classes = model_setting.pop("num_classes", 2) assert pretrained is not None model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] num_heads = model_setting['num_attention_heads'] s = self.configs.Data.Train.dataset.max_length get_model_size(l, h, v, s) if self.nranks == 1: model = gpt.GPTForSequenceClassification( gpt.GPTModel(**model_setting), num_classes) else: raise NotImplementedError pretrained_path = pretrained + ".pdparams" assert os.path.exists( pretrained_path), f'{pretrained_path} is not exists!' model_dict = paddle.load(pretrained_path) # Note(GuoxiaWang): Guess whether to convert fused vs non-fused parameters. 
# 'q_proj' vs 'qkv_proj' def is_fused(model_state): for key in model_state: if 'qkv_proj' in key: return True return False def split_params(model_state, num_layers): for idx in range(num_layers): qkv_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias') qkv_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight') qkv_b = qkv_b.reshape((num_heads, 3, -1)) qkv_w = qkv_w.reshape((h, num_heads, 3, -1)) q_w, k_w, v_w = np.split(qkv_w, 3, axis=2) q_w = q_w.reshape((h, -1)) k_w = k_w.reshape((h, -1)) v_w = v_w.reshape((h, -1)) q_b, k_b, v_b = np.split(qkv_b, 3, axis=1) q_b = q_b.reshape((-1)) k_b = k_b.reshape((-1)) v_b = v_b.reshape((-1)) model_state[ f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias'] = q_b model_state[ f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight'] = q_w model_state[ f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias'] = k_b model_state[ f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight'] = k_w model_state[ f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias'] = v_b model_state[ f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight'] = v_w return model_state def fuse_params(model_state, num_layers): for idx in range(num_layers): q_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias') q_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight') k_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias') k_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight') v_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias') v_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight') q_w = q_w.reshape((h, num_heads, -1)) k_w = k_w.reshape((h, num_heads, -1)) v_w = v_w.reshape((h, num_heads, -1)) qkv_w = np.stack([q_w, k_w, v_w], axis=2) qkv_w = qkv_w.reshape((h, -1)) q_b = q_b.reshape((num_heads, -1)) k_b = k_b.reshape((num_heads, -1)) v_b = v_b.reshape((num_heads, -1)) qkv_b = np.stack([q_b, k_b, v_b], axis=1) qkv_b = qkv_b.reshape((-1)) model_state[ f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight'] = qkv_w model_state[ f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias'] = qkv_b return model_state fused = is_fused(model.state_dict()) load_fused = is_fused(model_dict) if fused is True and load_fused is False: model_dict = fuse_params(model_dict, l) elif fused is False and load_fused is True: model_dict = split_params(model_dict, l) for name, param in model.state_dict().items(): if name in model_dict and param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) model.set_state_dict(model_dict) logger.info(f'Load pretrained weight from {pretrained_path}') return model def forward(self, tokens): return self.model(tokens) def training_step(self, batch): input_ids, labels = batch input_ids.stop_gradient = True labels.stop_gradient = True logits = self(input_ids) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): speed = 1. 
/ log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_length logger.info( "[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s" % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size)) def validation_step(self, batch): input_ids, labels = batch input_ids.stop_gradient = True labels.stop_gradient = True logits = self(input_ids) loss = self.eval_loss_fn(logits, labels) correct = self.eval_metric.compute(logits, labels) self.eval_metric.update(correct) return loss def validation_step_end(self, log_dict): speed = 1. / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost'], speed)) def test_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.eval_loss_fn(preds, labels, loss_mask) return loss def test_step_end(self, log_dict): speed = 1. / log_dict['test_cost'] logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'], speed)) def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) def validation_epoch_end(self, log_dict): res = self.eval_metric.accumulate() self.eval_metric.reset() if isinstance(self.eval_metric, AccuracyAndF1): msg = "acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f" % ( res[0], res[1], res[2], res[3], res[4]) metric = res[4] elif isinstance(self.eval_metric, Mcc): msg = "mcc: %.5f" % (res[0]) metric = res[0] elif isinstance(self.eval_metric, PearsonAndSpearman): msg = "pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f" % ( res[0], res[1], res[2]) metric = res[2] else: msg = "acc: %.5f" % (res) metric = res if metric > self.best_metric: self.best_metric = metric logger.info( "[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f" % (log_dict['epoch'], log_dict['eval_cost'], msg, self.best_metric)) class GPTGenerationModule(BasicModule): def __init__(self, configs): self.configs = configs self.generation_cfgs = configs.Generation self.nranks = paddle.distributed.get_world_size() super().__init__(configs) def process_configs(self, configs): configs = process_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) if 'Compress' in self.configs and 'Quantization' in self.configs.Compress: quant_setting = copy.deepcopy(self.configs.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', 
self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: model = gpt.GPTForGeneration( gpt.GPTModel(**model_setting), self.generation_cfgs) else: assert self.nranks == self.configs.Distributed.dp_degree, \ "only support single card and data parallel in generation task." model = gpt.GPTForGenerationHybrid( gpt.GPTModelHybrid(**model_setting), self.generation_cfgs) self.generation_cfgs['max_dec_len'] = self.adjust_length_to_model( self.generation_cfgs['max_dec_len'], 512) self.generation_cfgs['bos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['eos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['pad_token_id'] = self.tokenizer.eos_token_id return model def adjust_length_to_model(self, length, max_sequence_length): if length < 0 or length > max_sequence_length: length = max_sequence_length return length def left_padding(self, inputs, pad_id, padding="longest"): assert "input_ids" in inputs, "input_ids should be in inputs!" max_length = 0 for ids in inputs["input_ids"]: max_length = max(max_length, len(ids)) def extend_max_lenth(value, max_length, to_pad_id): return [to_pad_id] * (max_length - len(value)) + value def extend_filed(name, max_length, to_pad_id): values = inputs[name] res = [] for index, value in enumerate(values): res.append(extend_max_lenth(value, max_length, to_pad_id)) inputs[name] = res extend_filed("input_ids", max_length, pad_id) if "attention_mask" in inputs: extend_filed("attention_mask", max_length, 0) if "position_ids" in inputs: extend_filed("position_ids", max_length, 0) return inputs def generate(self, input_text): return self(input_text) def forward(self, input_text): input_ids = self.tokenizer.encode(input_text) inputs = {'input_ids': [input_ids]} inputs = self.left_padding(inputs, self.tokenizer.eos_token_id) input_ids = inputs['input_ids'] if len(input_ids) == 0: input_ids = None else: # [1, seq_len] input_ids = paddle.to_tensor(input_ids, dtype='int64') ids, scores = self.model(input_ids=input_ids) generated_sequences = [] for i, generated_ids in enumerate(ids): generated_ids = generated_ids.numpy().tolist() # Decode text text = self.tokenizer.convert_ids_to_string(generated_ids) sequence = input_text + text generated_sequences.append(sequence) return generated_sequences def input_spec(self): return [InputSpec(shape=[None, None], name="input_ids", dtype='int64')] class GPTEvalModule(LanguageModule): def __init__(self, configs): self.eval_cfgs = configs.Offline_Eval super().__init__(configs) self.post_process_configs() self.first_step = True self.total_score = 0 self.score_name = "loss" if not self.eval_cfgs.cloze_eval else "number correct" def post_process_configs(self): self.configs.pop("Optimizer", None) self.configs.pop("Inference", None) self.configs.Data.pop("Train", None) self.configs.Data.pop("Test", None) self.configs.Data.Eval.pop("sampler", None) self.configs.Data.Eval.loader.collate_fn = "gpt_collate_fn" self.configs.Data.Eval.loader.batch_size = self.eval_cfgs.batch_size self.configs.Data.Eval.dataset.input_dir = self.eval_cfgs.eval_path self.configs.Data.Eval.dataset.max_seq_len = self.eval_cfgs.max_seq_len self.configs.Engine.logging_freq = self.eval_cfgs.logging_freq if not self.eval_cfgs.cloze_eval: self.configs.Data.Eval.dataset.name = "LM_Eval_Dataset" self.configs.Data.Eval.dataset.overlapping_eval = self.eval_cfgs.overlapping_eval else: self.configs.Data.Eval.dataset.name = "Lambada_Eval_Dataset" def get_model(self): 
model_setting = copy.deepcopy(self.configs.Model) if 'Compress' in self.configs and 'Quantization' in self.configs.Compress: quant_setting = copy.deepcopy(self.configs.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: raise RuntimeError( "Only single-card offline eval is supported in GPTModel now.") return model def forward(self, tokens, ids, mask): return self.model(tokens, ids, mask) def validation_step(self, batch): tokens, loss_mask, attention_mask, position_ids, labels, info = batch preds = self(tokens, position_ids, attention_mask) if not self.eval_cfgs.cloze_eval: if self.first_step: self.num_original_tokens = info.numpy()[0][0] self.num_tokenized_tokens = info.numpy()[0][1] masked_lm_loss = paddle.nn.functional.cross_entropy( preds, labels, reduction="none") loss = paddle.sum(masked_lm_loss * loss_mask) return loss else: if self.first_step: self.num_examples = info.numpy()[0][0] outputs = paddle.argmax(preds, -1) acc = paddle.cast(outputs == labels, 'float32') acc = paddle.where( paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc)) acc = paddle.sum(paddle.prod(acc, -1)) return acc self.first_step = False def validation_step_end(self, log_dict): speed = 1. 
/ log_dict['eval_cost'] if not self.eval_cfgs.cloze_eval: self.total_score += log_dict[ 'loss'] * self.configs.Engine.logging_freq / ( self.num_tokenized_tokens - 1) else: self.total_score += log_dict[ 'loss'] * self.configs.Engine.logging_freq logger.info("[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], self.score_name, self.total_score, speed)) def validation_epoch_end(self, log_dict): if not self.eval_cfgs.cloze_eval: total_loss = float(self.total_score) ppl = math.exp(min(20, total_loss)) token_ratio = (self.num_tokenized_tokens - 1) / ( self.num_original_tokens - 1) adjusted_ppl = math.exp(min(20, total_loss * token_ratio)) string = ' validation results on {} | '.format( self.eval_cfgs.eval_path) string += 'avg loss: {:.4E} | '.format(total_loss) string += 'ppl: {:.4E} | '.format(ppl) string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) string += 'token ratio: {} |'.format(token_ratio) else: num_correct = float(self.total_score) acc = float(num_correct / self.num_examples) string = ' validation results on {} | '.format( self.eval_cfgs.eval_path) string += 'number correct: {:.4E} | '.format(num_correct) string += 'total examples: {:.4E} | '.format(self.num_examples) string += 'avg accuracy: {:.4E}'.format(acc) logger.info(string) def input_spec(self): return [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] class MoEModule(LanguageModule): def __init__(self, configs): super(MoEModule, self).__init__(configs) assert self.nranks == configs.Distributed.dp_degree, \ "only support single card or data parallel in MoE model." def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len get_model_size(l, h, v, s) if self.nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree if self.configs.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) return model def get_loss_fn(self): if self.nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird() return loss_fn def training_step(self, batch): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = self(tokens, position_ids) loss = self.loss_fn(preds, labels, loss_mask) with paddle.amp.auto_cast(enable=False): if self.configs.Model.gate != "naive" and \ self.configs.Model.balance_loss_weight: gpt_layer = self.model._layers.gpt if isinstance( self.model, paddle.DataParallel) else self.model.gpt aux_loss_list = [ l.moe_mlp.gate.get_loss(clear=False) for l in gpt_layer.decoder.layers if hasattr(l.moe_mlp, "gate") ] bal_loss = paddle.concat(aux_loss_list) if bal_loss.dtype == paddle.float16: bal_loss = paddle.cast(bal_loss, dtype=paddle.float32) bal_loss = bal_loss.mean() loss += bal_loss * self.configs.Engine.balance_loss_weight return loss def initialize_mp_dp_parameters(self): hcg = env.get_hcg() mp_group = hcg.get_model_parallel_group() mp_src_rank = hcg.get_model_parallel_group_src_rank() dp_group 
= hcg.get_data_parallel_group() dp_src_rank = hcg.get_data_parallel_group_src_rank() for param in self.model.parameters(): if "expert_" in param.name: setattr(param, "no_sync", True) continue if not param.is_distributed: paddle.distributed.broadcast( param.detach(), src=mp_src_rank, group=mp_group, use_calc_stream=True) paddle.distributed.broadcast( param.detach(), src=dp_src_rank, group=dp_group, use_calc_stream=True) ================================================ FILE: ppfleetx/models/language_model/metrics.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import math import warnings from functools import partial import numpy as np import paddle from paddle.metric import Metric, Accuracy, Precision, Recall __all__ = [ 'Accuracy', 'AccuracyAndF1', 'Mcc', 'PearsonAndSpearman', 'MultiLabelsMetric' ] class AccuracyAndF1(Metric): """ This class encapsulates Accuracy, Precision, Recall and F1 metric logic, and `accumulate` function returns accuracy, precision, recall and f1. The overview of all metrics could be seen at the document of `paddle.metric `_ for details. Args: topk (int or tuple(int), optional): Number of top elements to look at for computing accuracy. Defaults to (1,). pos_label (int, optional): The positive label for calculating precision and recall. Defaults to 1. name (str, optional): String name of the metric instance. Defaults to 'acc_and_f1'. Example: .. code-block:: import paddle from paddlenlp.metrics import AccuracyAndF1 x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]]) y = paddle.to_tensor([[1], [0], [1], [1]]) m = AccuracyAndF1() correct = m.compute(x, y) m.update(correct) res = m.accumulate() print(res) # (0.5, 0.5, 0.3333333333333333, 0.4, 0.45) """ def __init__(self, topk=(1, ), pos_label=1, name='acc_and_f1', *args, **kwargs): super(AccuracyAndF1, self).__init__(*args, **kwargs) self.topk = topk self.pos_label = pos_label self._name = name self.acc = Accuracy(self.topk, *args, **kwargs) self.precision = Precision(*args, **kwargs) self.recall = Recall(*args, **kwargs) self.reset() def compute(self, pred, label, *args): """ Accepts network's output and the labels, and calculates the top-k (maximum value in topk) indices for accuracy. Args: pred (Tensor): Predicted tensor, and its dtype is float32 or float64, and has a shape of [batch_size, num_classes]. label (Tensor): The ground truth tensor, and its dtype is is int64, and has a shape of [batch_size, 1] or [batch_size, num_classes] in one hot representation. Returns: Tensor: Correct mask, each element indicates whether the prediction equals to the label. Its' a tensor with a data type of float32 and has a shape of [batch_size, topk]. 
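
        Note:
            Besides returning the correct mask, this method caches ``label``
            and the positive-class probabilities
            ``softmax(pred)[:, pos_label]``; ``update`` then feeds them to the
            internal ``Precision`` and ``Recall`` metrics.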
""" self.label = label self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label] return self.acc.compute(pred, label) def update(self, correct, *args): """ Updates the metrics states (accuracy, precision and recall), in order to calculate accumulated accuracy, precision and recall of all instances. Args: correct (Tensor): Correct mask for calculating accuracy, and it's a tensor with shape [batch_size, topk] and has a dtype of float32. """ self.acc.update(correct) self.precision.update(self.preds_pos, self.label) self.recall.update(self.preds_pos, self.label) def accumulate(self): """ Calculates and returns the accumulated metric. Returns: tuple: The accumulated metric. A tuple of shape (acc, precision, recall, f1, average_of_acc_and_f1) With the fields: - `acc` (numpy.float64): The accumulated accuracy. - `precision` (numpy.float64): The accumulated precision. - `recall` (numpy.float64): The accumulated recall. - `f1` (numpy.float64): The accumulated f1. - `average_of_acc_and_f1` (numpy.float64): The average of accumulated accuracy and f1. """ acc = self.acc.accumulate() precision = self.precision.accumulate() recall = self.recall.accumulate() if precision == 0.0 or recall == 0.0: f1 = 0.0 else: # 1/f1 = 1/2 * (1/precision + 1/recall) f1 = (2 * precision * recall) / (precision + recall) return ( acc, precision, recall, f1, (acc + f1) / 2, ) def reset(self): """ Resets all metric states. """ self.acc.reset() self.precision.reset() self.recall.reset() self.label = None self.preds_pos = None def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name class Mcc(Metric): """ This class calculates `Matthews correlation coefficient `_ . Args: name (str, optional): String name of the metric instance. Defaults to 'mcc'. Example: .. code-block:: import paddle from paddlenlp.metrics import Mcc x = paddle.to_tensor([[-0.1, 0.12], [-0.23, 0.23], [-0.32, 0.21], [-0.13, 0.23]]) y = paddle.to_tensor([[1], [0], [1], [1]]) m = Mcc() (preds, label) = m.compute(x, y) m.update((preds, label)) res = m.accumulate() print(res) # (0.0,) """ def __init__(self, name='mcc', *args, **kwargs): super(Mcc, self).__init__(*args, **kwargs) self._name = name self.tp = 0 # true positive self.fp = 0 # false positive self.tn = 0 # true negative self.fn = 0 # false negative def compute(self, pred, label, *args): """ Processes the pred tensor, and returns the indices of the maximum of each sample. Args: pred (Tensor): The predicted value is a Tensor with dtype float32 or float64. Shape is [batch_size, 1]. label (Tensor): The ground truth value is Tensor with dtype int64, and its shape is [batch_size, 1]. Returns: tuple: A tuple of preds and label. Each shape is [batch_size, 1], with dtype float32 or float64. """ preds = paddle.argsort(pred, descending=True)[:, :1] return (preds, label) def update(self, preds_and_labels): """ Calculates states, i.e. the number of true positive, false positive, true negative and false negative samples. Args: preds_and_labels (tuple[Tensor]): Tuple of predicted value and the ground truth label, with dtype float32 or float64. Each shape is [batch_size, 1]. 
""" preds = preds_and_labels[0] labels = preds_and_labels[1] if isinstance(preds, paddle.Tensor): preds = preds.numpy() if isinstance(labels, paddle.Tensor): labels = labels.numpy().reshape(-1, 1) sample_num = labels.shape[0] for i in range(sample_num): pred = preds[i] label = labels[i] if pred == 1: if pred == label: self.tp += 1 else: self.fp += 1 else: if pred == label: self.tn += 1 else: self.fn += 1 def accumulate(self): """ Calculates and returns the accumulated metric. Returns: tuple: Returns the accumulated metric, a tuple of shape (mcc,), `mcc` is the accumulated mcc and its data type is float64. """ if self.tp == 0 or self.fp == 0 or self.tn == 0 or self.fn == 0: mcc = 0.0 else: # mcc = (tp*tn-fp*fn)/ sqrt(tp+fp)(tp+fn)(tn+fp)(tn+fn)) mcc = (self.tp * self.tn - self.fp * self.fn) / math.sqrt( (self.tp + self.fp) * (self.tp + self.fn) * (self.tn + self.fp) * (self.tn + self.fn)) return (mcc, ) def reset(self): """ Resets all metric states. """ self.tp = 0 # true positive self.fp = 0 # false positive self.tn = 0 # true negative self.fn = 0 # false negative def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name class PearsonAndSpearman(Metric): """ The class calculates `Pearson correlation coefficient `_ and `Spearman's rank correlation coefficient `_ . Args: name (str, optional): String name of the metric instance. Defaults to 'pearson_and_spearman'. Example: .. code-block:: import paddle from paddlenlp.metrics import PearsonAndSpearman x = paddle.to_tensor([[0.1], [1.0], [2.4], [0.9]]) y = paddle.to_tensor([[0.0], [1.0], [2.9], [1.0]]) m = PearsonAndSpearman() m.update((x, y)) res = m.accumulate() print(res) # (0.9985229081857804, 1.0, 0.9992614540928901) """ def __init__(self, name='pearson_and_spearman', *args, **kwargs): super(PearsonAndSpearman, self).__init__(*args, **kwargs) self._name = name self.preds = [] self.labels = [] def update(self, preds_and_labels): """ Ensures the type of preds and labels is numpy.ndarray and reshapes them into [-1, 1]. Args: preds_and_labels (tuple[Tensor] or list[Tensor]): Tuple or list of predicted value and the ground truth label. Its data type should be float32 or float64 and its shape is [batch_size, d0, ..., dN]. """ preds = preds_and_labels[0] labels = preds_and_labels[1] if isinstance(preds, paddle.Tensor): preds = preds.numpy() if isinstance(labels, paddle.Tensor): labels = labels.numpy() preds = np.squeeze(preds.reshape(-1, 1)).tolist() labels = np.squeeze(labels.reshape(-1, 1)).tolist() self.preds.append(preds) self.labels.append(labels) def accumulate(self): """ Calculates and returns the accumulated metric. Returns: tuple: Returns the accumulated metric, a tuple of (pearson, spearman, the_average_of_pearson_and_spearman). With the fields: - `pearson` (numpy.float64): The accumulated pearson. - `spearman` (numpy.float64): The accumulated spearman. - `the_average_of_pearson_and_spearman` (numpy.float64): The average of accumulated pearson and spearman correlation coefficient. 
""" preds = [item for sublist in self.preds for item in sublist] labels = [item for sublist in self.labels for item in sublist] pearson = self.pearson(preds, labels) spearman = self.spearman(preds, labels) return ( pearson, spearman, (pearson + spearman) / 2, ) def pearson(self, preds, labels): n = len(preds) # simple sums sum1 = sum(float(preds[i]) for i in range(n)) sum2 = sum(float(labels[i]) for i in range(n)) # sum up the squares sum1_pow = sum([pow(v, 2.0) for v in preds]) sum2_pow = sum([pow(v, 2.0) for v in labels]) # sum up the products p_sum = sum([preds[i] * labels[i] for i in range(n)]) numerator = p_sum - (sum1 * sum2 / n) denominator = math.sqrt( (sum1_pow - pow(sum1, 2) / n) * (sum2_pow - pow(sum2, 2) / n)) if denominator == 0: return 0.0 return numerator / denominator def spearman(self, preds, labels): preds_rank = self.get_rank(preds) labels_rank = self.get_rank(labels) total = 0 n = len(preds) for i in range(n): total += pow((preds_rank[i] - labels_rank[i]), 2) spearman = 1 - float(6 * total) / (n * (pow(n, 2) - 1)) return spearman def get_rank(self, raw_list): x = np.array(raw_list) r_x = np.empty(x.shape, dtype=int) y = np.argsort(-x) for i, k in enumerate(y): r_x[k] = i + 1 return r_x def reset(self): """ Resets all metric states. """ self.preds = [] self.labels = [] def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name class MultiLabelsMetric(Metric): """ This class encapsulates Accuracy, Precision, Recall and F1 metric logic in multi-labels setting (also the binary setting). Some codes are taken and modified from sklearn.metrics . Args: num_labels (int) The total number of labels which is usually the number of classes name (str, optional): String name of the metric instance. Defaults to 'multi_labels_metric'. Example: .. code-block:: import paddle from paddlenlp.metrics import MultiLabelsMetric x = paddle.to_tensor([[0.1, 0.2, 0.9], [0.5, 0.8, 0.5], [0.6, 1.5, 0.4], [2.8, 0.7, 0.3]]) y = paddle.to_tensor([[2], [1], [2], [1]]) m = MultiLabelsMetric(num_labels=3) args = m.compute(x, y) m.update(args) result1 = m.accumulate(average=None) # (array([0.0, 0.5, 1.0]), array([0.0, 0.5, 0.5]), array([0.0, 0.5, 0.66666667])) result2 = m.accumulate(average='binary', pos_label=0) # (0.0, 0.0, 0.0) result3 = m.accumulate(average='binary', pos_label=1) # (0.5, 0.5, 0.5) result4 = m.accumulate(average='binary', pos_label=2) # (1.0, 0.5, 0.6666666666666666) result5 = m.accumulate(average='micro') # (0.5, 0.5, 0.5) result6 = m.accumulate(average='macro') # (0.5, 0.3333333333333333, 0.38888888888888884) result7 = m.accumulate(average='weighted') # (0.75, 0.5, 0.5833333333333333) Note: When zero_division is encountered (details as followed), the corresponding metrics will be set to 0.0 precision is zero_division if there are no positive predictions recall is zero_division if there are no positive labels fscore is zero_division if all labels AND predictions are negative """ def __init__(self, num_labels, name='multi_labels_metric'): super(MultiLabelsMetric, self).__init__() if num_labels <= 1: raise ValueError( f"The num_labels is {num_labels}, which must be greater than 1." ) self.num_labels = num_labels self._name = name self._confusion_matrix = np.zeros((num_labels, 2, 2), dtype=int) def update(self, args): """ Updates the metrics states (accuracy, precision and recall), in order to calculate accumulated accuracy, precision and recall of all instances. 
Args: args (tuple of Tensor): the tuple returned from `compute` function """ pred = args[0].numpy() label = args[1].numpy() tmp_confusion_matrix = self._multi_labels_confusion_matrix(pred, label) self._confusion_matrix += tmp_confusion_matrix def accumulate(self, average=None, pos_label=1): """ Calculates and returns the accumulated metric. Args: average (str in {‘binary’, ‘micro’, ‘macro’, ’weighted’} or None, optional): Defaults to `None`. If `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - `binary` : Only report results for the class specified by pos_label. - `micro` : Calculate metrics globally by counting the total true positives, false negatives and false positives. - `macro` : Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - `weighted` : Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `macro` to account for label imbalance; it can result in an F-score that is not between precision and recall. pos_label (int, optional): The positive label for calculating precision and recall in binary settings. Noted: Only when `average='binary'`, this arguments will be used. Otherwise, it will be ignored. Defaults to 1. Returns: tuple: The accumulated metric. A tuple of shape (precision, recall, f1) With the fields: - `precision` (numpy.float64 or numpy.ndarray if average=None): The accumulated precision. - `recall` (numpy.float64 or numpy.ndarray if average=None): The accumulated recall. - `f1` (numpy.float64 or numpy.ndarray if average=None): The accumulated f1. """ if average not in {'binary', 'micro', 'macro', 'weighted', None}: raise ValueError(f"The average is {average}, which is unknown.") if average == 'binary': if pos_label >= self.num_labels: raise ValueError( f"The pos_label is {pos_label}, num_labels is {self.num_labels}. 
" f"The num_labels must be greater than pos_label.") confusion_matrix = None # [*, 2, 2] if average == 'binary': confusion_matrix = np.expand_dims( self._confusion_matrix[pos_label], axis=0) elif average == 'micro': confusion_matrix = self._confusion_matrix.sum(axis=0, keepdims=True) # if average is 'macro' or 'weighted' or None else: confusion_matrix = self._confusion_matrix tp = confusion_matrix[:, 1, 1] # [*,] pred = tp + confusion_matrix[:, 0, 1] # [*,] true = tp + confusion_matrix[:, 1, 0] # [*,] def _robust_divide(numerator, denominator, metric_name): mask = denominator == 0.0 denominator = denominator.copy() denominator[mask] = 1 # avoid zero division result = numerator / denominator if not np.any(mask): return result # precision is zero_division if there are no positive predictions # recall is zero_division if there are no positive labels # fscore is zero_division if all labels AND predictions are negative warnings.warn(f'Zero division when calculating {metric_name}.', UserWarning) result[mask] = 0.0 return result precision = _robust_divide(tp, pred, 'precision') recall = _robust_divide(tp, true, 'recall') f1 = _robust_divide(2 * (precision * recall), (precision + recall), 'f1') weights = None # [num_labels] if average == 'weighted': weights = true if weights.sum() == 0: zero_division_value = np.float64(0.0) if pred.sum() == 0: return (zero_division_value, zero_division_value, zero_division_value) else: return (np.float64(0.0), zero_division_value, np.float64(0.0)) elif average == 'macro': weights = np.ones((self.num_labels), dtype=float) if average is not None: precision = np.average(precision, weights=weights) recall = np.average(recall, weights=weights) f1 = np.average(f1, weights=weights) return precision, recall, f1 def compute(self, pred, label): """ Accepts network's output and the labels, and calculates the top-k (maximum value in topk) indices for accuracy. Args: pred (Tensor): Predicted tensor, and its dtype is float32 or float64, and has a shape of [batch_size, *, num_labels]. label (Tensor): The ground truth tensor, and its dtype is is int64, and has a shape of [batch_size, *] or [batch_size, *, num_labels] in one hot representation. Returns: tuple of Tensor: it contains two Tensor of shape [*, 1]. The tuple should be passed to `update` function. 
""" if not (paddle.is_tensor(pred) and paddle.is_tensor(label)): raise ValueError('pred and label must be paddle tensor') if pred.shape[-1] != self.num_labels: raise ValueError(f'The last dim of pred is {pred.shape[-1]}, ' f'which should be num_labels') pred = paddle.reshape(pred, [-1, self.num_labels]) pred = paddle.argmax(pred, axis=-1) if label.shape[-1] == self.num_labels: label = paddle.reshape(label, [-1, self.num_labels]) label = paddle.argmax(label, axis=-1) else: label = paddle.reshape(label, [-1]) if paddle.max(label) >= self.num_labels: raise ValueError( f"Tensor label has value {paddle.max(label)}, " f"which is no less than num_labels") if pred.shape[0] != label.shape[0]: raise ValueError( f"The length of pred is not equal to the length of label") return pred, label def _multi_labels_confusion_matrix(self, pred, label): tp_bins = label[pred == label] tp = np.bincount(tp_bins, minlength=self.num_labels) # [num_labels,] tp_plus_fp = np.bincount( pred, minlength=self.num_labels) # [num_labels,] tp_plus_fn = np.bincount( label, minlength=self.num_labels) # [num_labels,] fp = tp_plus_fp - tp # [num_labels,] fn = tp_plus_fn - tp # [num_labels,] tn = pred.shape[0] - tp - fp - fn # [num_labels,] return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) # [num_labels, 2, 2] def reset(self): self._confusion_matrix = np.zeros((self.num_labels, 2, 2), dtype=int) def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name ================================================ FILE: ppfleetx/models/language_model/moe/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .gate import GShardGate, BaseGate, SwitchGate, NaiveGate from .moe_layer import MoELayer ================================================ FILE: ppfleetx/models/language_model/moe/comm/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/moe/comm_ops.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). import paddle from paddle.autograd import PyLayer from paddle.distributed.utils.moe_utils import global_scatter, global_gather from .utils import _local_scatter, _local_gather, _all_gather class MoEScatter(PyLayer): r""" Scatter input samples from [batch x sequences] to contiguous alone experts. If `world_size` is greater than 1, the samples will first be locally scattered, and then exchanged across workers. """ @staticmethod def forward(ctx, inp, pos, local_expert_count, global_expert_count, fwd_batch_size, world_size, group=None): local_input_buf = _local_scatter(inp, pos) if world_size > 1: global_input_buf = global_scatter( local_input_buf, local_expert_count, global_expert_count, group=group) else: global_input_buf = local_input_buf ctx.moe_args = inp.shape[0], world_size, group variables = (pos, local_expert_count, global_expert_count) ctx.save_for_backward(*variables) return global_input_buf @staticmethod def backward(ctx, grad): (pos, local_expert_count, global_expert_count) = ctx.saved_tensor() (inp_batch_size, world_size, group) = ctx.moe_args if world_size > 1: local_grad_in = global_gather( grad, local_expert_count, global_expert_count, group=group) else: local_grad_in = grad grad_in = _local_gather(local_grad_in, pos, inp_batch_size) return grad_in, None, None, None class MoEGather(PyLayer): r""" Gather output samples from contiguous alone experts back to [batch x sequences]. Works symmetrically with MoEScatter. """ @staticmethod def forward(ctx, global_output_buf, pos, local_expert_count, global_expert_count, local_batch_size, world_size, group=None): if world_size > 1: local_output_buf = global_gather( global_output_buf, local_expert_count, global_expert_count, group=group) else: local_output_buf = global_output_buf output = _local_gather( local_output_buf, pos, local_batch_size, maybe_overlap=False) ctx.moe_args = (global_output_buf.shape[0], world_size, group) variables = (pos, local_expert_count, global_expert_count) ctx.save_for_backward(*variables) return output @staticmethod def backward(ctx, grad_out): pos, local_expert_count, global_expert_count = ctx.saved_tensor() fwd_batch_size, world_size, group = ctx.moe_args grad_out_buf = _local_scatter(grad_out, pos) if world_size > 1: global_grad_out_buf = global_scatter( grad_out_buf, local_expert_count, global_expert_count, group=group) else: global_grad_out_buf = grad_out_buf return global_grad_out_buf, None, None, None class AllGather(PyLayer): r""" A wrapper for the All-Gather function to support auto-differentiation. 
""" @staticmethod def forward(ctx, inp, rank, world_size, group): tensor_list = [] paddle.distributed.all_gather(tensor_list, inp, group=group) output = paddle.concat(tensor_list, axis=0) ctx.args = rank, inp.shape[0] return output @staticmethod def backward(ctx, grad_out): rank, dim0 = ctx.args return paddle.slice( grad_out, axes=[0], starts=[rank * dim0], ends=[(rank + 1) * dim0]) class Slice(PyLayer): r""" A wrapper for the Slice function to support auto-differentiation. """ @staticmethod def forward(ctx, inp, rank, world_size, group): B = inp.shape[0] local_batch_size = B // world_size batch_start = local_batch_size * rank batch_end = min(batch_start + local_batch_size, B) inp = paddle.slice( inp, axes=[0], starts=[batch_start], ends=[batch_end]) ctx.args = world_size, group return inp @staticmethod def backward(ctx, grad_out): world_size, group = ctx.args return _all_gather(grad_out, group=group) ================================================ FILE: ppfleetx/models/language_model/moe/gate/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .gshard_gate import GShardGate from .switch_gate import SwitchGate from .naive_gate import NaiveGate from .base_gate import BaseGate ================================================ FILE: ppfleetx/models/language_model/moe/gate/base_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/base_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). 
import paddle.nn as nn class BaseGate(nn.Layer): def __init__(self, num_expert, group=None): super().__init__() self.world_size = group.nranks if group is not None else 1 self.num_expert = num_expert self.tot_expert = self.world_size * num_expert self.loss = None def forward(self, x): raise NotImplementedError("Please implement the forward function.") def set_loss(self, loss): self.loss = loss def get_loss(self, clear=True): loss = self.loss if clear: self.loss = None return loss ================================================ FILE: ppfleetx/models/language_model/moe/gate/gshard_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). import math import paddle import paddle.nn.functional as F from .naive_gate import NaiveGate from ..utils import limit_by_capacity class GShardGate(NaiveGate): def __init__(self, d_model, num_expert, topk=2, capacity=(1.2, 2.4), random_routing=True, group=None): assert topk == 2, "topk should be 2 in gshard" super().__init__(d_model, num_expert, group) self.capacity = capacity self.random_routing = random_routing self.group = group def forward(self, x): topk_val, topk_idx, gate_score = super().forward( x, return_all_scores=True) s = gate_score.shape[0] top1_idx = topk_idx.flatten() c_e = paddle.scatter( paddle.zeros(shape=[self.tot_expert]), top1_idx, paddle.ones_like( top1_idx, dtype="float32"), overwrite=False) / s m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0) loss = paddle.mean(c_e * m_e) * (self.num_expert**2) self.set_loss(loss) cap_rate = self.capacity[0 if self.training else 1] capacity = math.ceil(cap_rate * x.shape[0]) _new_lec, _new_gec, topk_idx = limit_by_capacity( topk_idx, self.num_expert, self.world_size, capacity, group=self.group) if self.random_routing: rand_routing_prob = paddle.rand( shape=[gate_score.shape[0]], dtype="float32") topk_idx = paddle.distributed.models.moe.utils._random_routing( topk_idx, topk_val, rand_routing_prob) return topk_val, topk_idx ================================================ FILE: ppfleetx/models/language_model/moe/gate/naive_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/naive_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). from .base_gate import BaseGate import paddle import paddle.nn as nn class NaiveGate(BaseGate): def __init__(self, d_model, num_expert, group=None, topk=2): super().__init__(num_expert, group) self.gate = nn.Linear(d_model, self.tot_expert) self.gate.weight.name = "gate_" + self.gate.weight.name self.gate.bias.name = "gate_" + self.gate.bias.name self.top_k = topk def forward(self, inp, return_all_scores=False): gate = self.gate(inp) gate_top_k_val, gate_top_k_idx = paddle.topk( gate, k=self.top_k, axis=-1, largest=True, sorted=False) if return_all_scores: return gate_top_k_val, gate_top_k_idx, gate return gate_top_k_val, gate_top_k_idx ================================================ FILE: ppfleetx/models/language_model/moe/gate/switch_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/switch_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). 
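# Illustrative sketch, not part of the original sources: a minimal,
# single-process use of the NaiveGate defined in naive_gate.py above. The
# sizes are demo assumptions; with group=None the gate sees world_size == 1,
# so tot_expert == num_expert. The helper is defined but never called here.
def _demo_naive_gate():
    import paddle
    from ppfleetx.models.language_model.moe.gate import NaiveGate

    gate = NaiveGate(d_model=16, num_expert=4, group=None, topk=2)
    tokens = paddle.randn([8, 16])       # [num_tokens, d_model]
    topk_val, topk_idx = gate(tokens)    # each has shape [8, 2]
    return topk_val, topk_idx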
import math import paddle import paddle.nn.functional as F from .naive_gate import NaiveGate from ..utils import limit_by_capacity class SwitchGate(NaiveGate): def __init__(self, d_model, num_expert, topk=1, switch_eps=.1, capacity=(1.2, 2.4), group=None): assert topk == 1, "topk should be 1 in switch" super().__init__(d_model, num_expert, group, topk=1) self.switch_eps = switch_eps self.capacity = capacity self.group = group def forward(self, inp): score = self.gate(inp) if self.training: noise = paddle.rand(shape=score.shape) noise = noise * 2 * self.switch_eps + 1.0 - self.switch_eps score += noise score = F.softmax(score, axis=-1) top1_score, top1_idx = paddle.topk(score, k=1, axis=-1, largest=True) cap_rate = self.capacity[0 if self.training else 1] capacity = math.ceil(cap_rate * inp.shape[0]) _new_lec, _new_gec, top1_idx = limit_by_capacity( top1_idx, self.num_expert, self.world_size, capacity, group=self.group) valid_idx = top1_idx[top1_idx > -1] valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1]) fraction_expert = paddle.scatter_nd_add( x=paddle.zeros(shape=[self.tot_expert]), index=valid_idx_tmp, updates=paddle.ones_like( valid_idx, dtype=paddle.float32).reshape( shape=[len(valid_idx)]), ) / valid_idx.numel() prob_expert = score.sum(axis=0) / valid_idx.numel() loss = (fraction_expert * prob_expert).sum() * self.tot_expert self.set_loss(loss) return top1_score, top1_idx ================================================ FILE: ppfleetx/models/language_model/moe/moe_layer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/layers.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). import numpy as np import paddle import paddle.nn as nn from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate from .comm_ops import MoEScatter, MoEGather, AllGather, Slice from .utils import prepare_forward from paddle.distributed.fleet.utils import recompute from paddle.incubate.distributed.fleet import recompute_hybrid class MoELayer(nn.Layer): """MoE Layer Args: d_model: (int) model dimention experts: (list|nn.LayerList) expert networks list gate: (str|BaseGate|None): if gate is a str, it can only be "naive", "gshard", "switch" or None, default is "naive" else gate is an instance of BaseGate top_k: (int) default value is 2 moe_group: moe group for experts communication mp_group: mp group for mp commutication recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute. recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given. Examples: .. 
code-block:: python from paddle.nn import layer, LayerList from paddle.distributed.moe import MoElayer from paddle.distributed.collective import Group from paddle.distributed import fleet moe_group = Group(fleet.worker_index(), 0, list(range(fleet.worker_num()))) mp_group = None num_experts=8 dim_feedforward=512 d_model=8 top_k=2 class ExpertLayer(Layer): def __init__(self, d_model, d_hidden, name=None): super(ExpertLayer, self).__init__() self.htoh4 = nn.Linear(d_model, d_hidden) self.h4toh = nn.Linear(d_hidden, d_model) def forward(self, x): x = self.htoh4(x) x = self.h4toh(x) return x experts_list = LayerList() for expi in range(num_experts): exp_layer = ExpertLayer(d_model, dim_feedforward) experts_list.append(exp_layer) moeLayer = MoELayer(d_model = d_model, experts=experts_list, gate="gshard", top_k=2, moe_group=moe_group, mp_group=mp_group, recompute_interval=0) """ def __init__(self, d_model, experts, moe_group=None, mp_group=None, top_k=2, gate=None, recompute_interval=0, recompute_partition=False, recompute_offload=False): super(MoELayer, self).__init__() self.d_model = d_model assert experts is not None assert isinstance(experts, (list, nn.LayerList)), \ "The type of experts must be list or nn.LayerList" for i, exp in enumerate(experts): assert isinstance( exp, nn.Layer), "The type of experts[{}] must be nn.Layer".format(i) self.experts = nn.LayerList(experts) if isinstance(experts, list) else experts self.num_experts = len(experts) gate = "naive" if gate is None else gate assert isinstance(gate, (str, BaseGate)), \ "The type of gate must be str or an instance of BaseGate" self.top_k = top_k # only support mp/dp self.group = moe_group self.mp_group = mp_group self.world_size = self.group.nranks \ if self.group is not None else 1 if isinstance(gate, str): gate_map = { "naive": NaiveGate, "gshard": GShardGate, "switch": SwitchGate, } if gate in gate_map.keys(): self.gate = gate_map[gate](self.d_model, num_expert=self.num_expert, topk=self.top_k, group=self.group) else: assert False, "We only support naive gate, \ gshard gate and switch gate, \ but you choose {} gate.".format(gate) elif isinstance(gate, BaseGate): self.gate = gate else: raise TypeError("The type of gate must be either str in ('naive', \ 'gshard', 'switch') or an instance of moe.BaseGate") self.recompute_interval = recompute_interval self.recompute_ctx = { "mp_group": self.mp_group, "offload": recompute_offload, "partition": recompute_partition, } def forward(self, inp): origin_shape = inp.shape inp = inp.reshape_([-1, origin_shape[-1]]) mp_rank = 0 mp_size = 1 if self.mp_group is not None: mp_rank = self.mp_group.rank mp_size = self.mp_group.nranks if mp_size > 1: inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group) value, gate = self.gate(inp) ( pos, local_expert_count, global_expert_count, fwd_expert_count, fwd_batch_size, ) = prepare_forward(gate, self.num_expert, self.world_size, self.group) topk = 1 if len(gate.shape) == 2: topk = gate.shape[1] if pos.shape != [0]: temp_pos = pos // topk else: temp_pos = pos assert topk == self.top_k x = MoEScatter.apply(inp, temp_pos, local_expert_count, global_expert_count, fwd_batch_size, self.world_size, self.group) d_model = self.d_model def experts_fwd(x, fwd_expert_count, experts): if x.shape[0] == 0: return x y = [] last_index = 0 assert isinstance(fwd_expert_count, np.ndarray) assert len(experts) == len(fwd_expert_count) for idx, expert_count in enumerate(fwd_expert_count): if expert_count <= 0: continue y.append(experts[idx](x[last_index:expert_count + 
last_index])) last_index = expert_count + last_index return paddle.concat(y, axis=0) if self.recompute_interval <= 0 or x.shape[0] == 0: x = experts_fwd(x, fwd_expert_count.numpy(), self.experts) elif self.world_size > 1: x = recompute_hybrid(self.recompute_ctx, experts_fwd, x, fwd_expert_count.numpy(), self.experts) else: x = recompute(experts_fwd, x, fwd_expert_count.numpy(), self.experts) out_batch_size = inp.shape[0] if len(gate.shape) == 2: out_batch_size *= gate.shape[1] x = MoEGather.apply(x, pos, local_expert_count, global_expert_count, out_batch_size, self.world_size, self.group) x = x.reshape([-1, self.top_k, d_model]) value = value.reshape([x.shape[0], 1, self.top_k]) x = paddle.bmm(value, x).reshape([-1, d_model]) if mp_size > 1: x = AllGather.apply(x, mp_rank, mp_size, self.mp_group) x = paddle.reshape_(x, origin_shape) return x ================================================ FILE: ppfleetx/models/language_model/moe/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). 
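# Illustrative sketch, not part of the original sources: a single-process
# round trip through _local_scatter / _local_gather defined below in this
# file. Scattering reorders rows by `pos`; gathering with the same positions
# restores the original order. Values are demo assumptions and the helper is
# never called at import time.
def _demo_local_scatter_gather():
    import paddle

    inp = paddle.arange(6, dtype="float32").reshape([3, 2])   # 3 tokens, d_model=2
    pos = paddle.to_tensor([2, 0, 1], dtype="int64")          # a permutation
    buf = _local_scatter(inp, pos)                            # buf[i] == inp[pos[i]]
    out = _local_gather(buf, pos, out_batch_size=3)           # out[pos[i]] == buf[i]
    return out                                                # equals inp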
import paddle from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos def prepare_forward(gate, num_expert, world_size, moe_group): pos, local_expert_count, global_expert_count = count_by_gate( gate, num_expert, world_size, group=moe_group) with paddle.no_grad(): fwd_expert_count = global_expert_count.reshape_( [world_size, num_expert]).sum(axis=0) fwd_batch_size = int(fwd_expert_count.sum().item()) return ( pos, local_expert_count, global_expert_count, fwd_expert_count, fwd_batch_size, ) def _alltoall(in_tensor_list, group=None, use_calc_stream=True): if group is not None and not group.is_member(): return group = paddle.distributed.collective._get_default_group( ) if group is None else group out = paddle.empty(in_tensor_list.shape, in_tensor_list.dtype) task = group.process_group.alltoall(in_tensor_list, out) task.wait() return out def _local_scatter(inp, pos): if pos.shape != [0]: inp_buf = paddle.index_select(inp, pos, 0) else: inp_buf = paddle.empty([0, inp.shape[1]], dtype=inp.dtype) return inp_buf def _local_gather(inp, pos, out_batch_size, maybe_overlap=True): if pos.shape != [0]: origin_dtype = inp.dtype inp = paddle.cast(inp, dtype="float32") inp_buf = paddle.scatter( paddle.zeros( shape=[out_batch_size, inp.shape[-1]], dtype="float32"), pos, inp, overwrite=True) inp_buf = paddle.cast(inp_buf, dtype=origin_dtype) else: inp_buf = paddle.zeros( [out_batch_size, inp.shape[-1]], dtype=inp.dtype) return inp_buf def _all_gather(tensor, group=None, use_calc_stream=True): if group is not None and not group.is_member(): return group = paddle.distributed.collective._get_default_group( ) if group is None else group tensor_shape = list(tensor.shape) tensor_shape[0] *= group.nranks out = paddle.empty(tensor_shape, tensor.dtype) task = group.process_group.all_gather(tensor, out) task.wait() return out def count_by_gate(gate, num_expert, world_size, require_pos=True, group=None): total_expert_count = num_expert * world_size with paddle.no_grad(): local_expert_count = _number_count(gate, total_expert_count) if world_size > 1: global_expert_count = _alltoall(local_expert_count, group=group) else: global_expert_count = local_expert_count if not require_pos: pos = None else: lec_cum = paddle.cumsum(local_expert_count, axis=0) pos = _assign_pos(gate, lec_cum) return pos, local_expert_count, global_expert_count def limit_by_capacity(topk_idx, num_expert, world_size, capacity, group=None): with paddle.no_grad(): capacity = paddle.ones( shape=[num_expert], dtype=paddle.int64) * capacity pos, lec, gec = count_by_gate( topk_idx, num_expert, world_size, require_pos=False, group=group) new_gec = _limit_by_capacity(gec, capacity, world_size) if world_size > 1: assert group.nranks == world_size new_lec = _alltoall(new_gec, group=group) else: new_lec = new_gec topk_idx = _prune_gate_by_capacity(topk_idx, new_lec, num_expert, world_size) return new_lec, new_gec, topk_idx ================================================ FILE: ppfleetx/models/language_model/moe_exp/__init__.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/moe_exp/experts.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import copy class Experts(nn.Layer): def __init__(self, expert, num_local_experts=1, expert_group_name=None): super(Experts, self).__init__() self.fleetx_experts = nn.LayerList( [copy.deepcopy(expert) for i in range(num_local_experts)]) self.num_local_experts = num_local_experts # TODO: revisit allreduce for moe.gate... for expert in self.fleetx_experts: # TODO: Create param groups to handle expert + data case (e.g. param.group = moe_group) for name, param in expert.named_parameters(): param.allreduce = False param.group_name = expert_group_name def forward(self, inputs): chunks = paddle.chunk(inputs, chunks=self.num_local_experts, axis=1) expert_outputs = [] for chunk, expert in zip(chunks, self.fleetx_experts): out = expert(chunk) if type(out) is tuple: out = out[0] # Ignore the bias term for now expert_outputs += [out] expert_output = paddle.concat(expert_outputs, axis=1) return expert_output ================================================ FILE: ppfleetx/models/language_model/moe_exp/layer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/layer.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
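# Illustrative sketch, not part of the original sources: local expert fan-out
# with the Experts container defined in experts.py above. Shapes are demo
# assumptions; forward() splits the input into num_local_experts chunks along
# axis 1 and runs one deep-copied expert per chunk. Never called here.
def _demo_experts():
    import paddle
    import paddle.nn as nn
    from ppfleetx.models.language_model.moe_exp.experts import Experts

    experts = Experts(nn.Linear(8, 8), num_local_experts=2)
    x = paddle.randn([1, 2, 3, 8])   # [ep_size, num_local_experts, capacity, d_model]
    y = experts(x)                   # same shape as x
    return y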
import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from .experts import Experts from .sharded_moe import TopKGate, MOELayer class MoE(nn.Layer): def __init__(self, hidden_size, expert, num_experts=1, ep_size=1, k=1, capacity_factor=1., eval_capacity_factor=1., min_capacity=4, use_residual=False, noisy_gate_policy=None, drop_tokens=True, use_rts=False, enable_expert_tensor_parallelism=False): super(MoE, self).__init__() self.use_residual = use_residual self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism assert num_experts % ep_size == 0, f"Number of experts ({num_experts}) should be divisible by expert parallel size ({ep_size})" self.ep_size = ep_size self.expert_group_name = f"ep_size_{self.ep_size}" self.num_experts = num_experts self.num_local_experts = num_experts // self.ep_size # log_dist( # f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}', # [0]) assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \ 'Unsupported noisy_gate_policy: ' + noisy_gate_policy experts = Experts(expert, self.num_local_experts, self.expert_group_name) self.gate = TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, min_capacity, noisy_gate_policy, drop_tokens, use_rts) self.fleetx_moe = MOELayer(self.gate, experts, self.expert_group_name, self.ep_size, self.num_local_experts) if self.use_residual: self.mlp = expert # coefficient is used for weighted sum of the output of expert and mlp self.coefficient = nn.Linear(hidden_size, 2) def forward(self, hidden_states, used_token=None): """ MoE forward Arguments: hidden_states (Tensor): input to the layer used_token (Tensor, optional): default: None, mask only used tokens Returns: A tuple including output, gate loss, and expert count. * output (Tensor): output of the model * l_aux (Tensor): gate loss value * exp_counts (int): expert count """ output = self.fleetx_moe(hidden_states, used_token) if self.use_residual: # Residual MoE output_mlp = self.mlp(hidden_states) if type(output_mlp) is tuple: output_mlp = output_mlp[0] # Ignore the bias term for now coef = self.coefficient(hidden_states) coef = F.softmax(coef, dim=-1) output = output * coef[..., 0:1] + output_mlp * coef[..., 1:] return output ================================================ FILE: ppfleetx/models/language_model/moe_exp/mappings.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/mappings.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
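# Illustrative sketch, not part of the original sources: construction-only use
# of the MoE wrapper defined in layer.py above (4 experts, no expert
# parallelism, top-1 gating). Running forward() additionally assumes the
# distributed / expert-parallel setup this module normally relies on.
def _demo_moe_construction():
    import paddle.nn as nn
    from ppfleetx.models.language_model.moe_exp.layer import MoE

    expert = nn.Linear(64, 64)
    moe = MoE(hidden_size=64, expert=expert, num_experts=4, ep_size=1, k=1)
    return moe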
import paddle import paddle.distributed as dist from paddle.autograd import PyLayer #TODO: set axis for all_gather def _gather_tokens(input_, group, axis=0): """Gather tensors and concatenate them along a axisension""" # in case model is not deployed in distributed environment group = dist.collective._get_default_group() if group is None else group tensor_list = [paddle.empty_like(input_) for _ in range(group.nranks)] dist.all_gather(tensor_list, input_, group) output_ = paddle.concat(tensor_list, axis=axis) return output_ def _drop_tokens(input_, group, axis=0): """Divide a tensor among the tensor parallel ranks""" # in case model is not deployed in distributed environment group = dist.collective._get_default_group() if group is None else group total_chunks = group.nranks this_chunk = group.rank assert input_.shape[ axis] % total_chunks == 0, f"input dimention {axis} ({input_.shape[axis]}) is not divisible by tensor parallel world size ({total_chunks})" chunk_size = input_.shape[axis] // total_chunks return paddle.slice(input_, [axis], [this_chunk * chunk_size], [this_chunk * chunk_size + chunk_size]) class _GatherTokens(PyLayer): """All gather tokens among the tensor parallel ranks""" @staticmethod def forward(ctx, input_, group, axis): ctx.group = group ctx.axis = axis return _gather_tokens(input_, group, axis) @staticmethod def backward(ctx, grad_output): return _drop_tokens(grad_output, ctx.group, ctx.axis), None class _DropTokens(PyLayer): "Divide tokens equally among the tensor parallel ranks" @staticmethod def forward(ctx, input_, group, axis): ctx.group = group ctx.axis = axis return _drop_tokens(input_, axis) @staticmethod def backward(ctx, grad_output): return _gather_tokens(grad_output, ctx.group, ctx.axis), None def gather_tokens(input_, group=None, axis=0): if group is None or group.nranks == 1: # no tensor parallelism for non-experts return input_ return _GatherTokens.apply(input_, group, axis) def drop_tokens(input_, group=None, axis=0): if group is None or group.nranks == 1: # no tensor parallelism for non-experts return input_ return _DropTokens.apply(input_, group, axis) ================================================ FILE: ppfleetx/models/language_model/moe_exp/sharded_moe.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/sharded_moe.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
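# Illustrative sketch, not part of the original sources: behaviour of the
# token mappings defined in mappings.py above. Without a tensor-parallel group
# both calls are no-ops; with a group of size N, drop_tokens keeps 1/N of the
# chosen axis on each rank and gather_tokens concatenates the shards back.
def _demo_token_mappings():
    import paddle
    from ppfleetx.models.language_model.moe_exp.mappings import drop_tokens, gather_tokens

    x = paddle.randn([4, 6, 8])
    assert drop_tokens(x, group=None, axis=1) is x      # no group -> identity
    assert gather_tokens(x, group=None, axis=1) is x    # no group -> identity
    return x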
import paddle from typing import Callable, Dict, Tuple, Optional, Any from paddle.distribution import Uniform, Gumbel import paddle.nn.functional as F from paddle import Tensor import paddle.nn as nn import paddle.distributed as dist from paddle.autograd import PyLayer import paddle.distributed.fleet as fleet from .mappings import drop_tokens, gather_tokens uniform_map: Dict[str, Callable] = {} gumbel_map: Dict[str, Callable] = {} exp_selection_uniform_map: Dict[str, Callable] = {} def multiplicative_jitter(x, epsilon=1e-2): if epsilon == 0: return x device = paddle.get_device() uniform = uniform_map.get(device) if uniform is None: uniform = Uniform( low=paddle.to_tensor(1.0 - epsilon), high=paddle.to_tensor(1.0 + epsilon)).rsample # type: ignore uniform_map[device] = uniform return x * uniform(x.shape) def gumbel_rsample(shape): device = paddle.get_device() gumbel = gumbel_map.get(device) if gumbel is None: one = paddle.to_tensor(1.0) zero = paddle.to_tensor(0.0) gumbel = Gumbel(zero, one).rsample # type: ignore gumbel_map[device] = gumbel return gumbel(shape) # einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity # See https://arxiv.org/pdf/2006.16668.pdf for details. class _AllToAll(PyLayer): @staticmethod def forward(ctx: Any, group: dist.collective.Group, input: Tensor) -> Tensor: # type: ignore ctx.group = group output = paddle.empty_like(input) dist.alltoall_single(input, output, group=group) return output @staticmethod def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor]: return (None, _AllToAll.apply(ctx.group, *grad_output)) # einsum rewrites are on par or more performant # switch can be bubbled up in future USE_EINSUM = True # einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity # See https://arxiv.org/pdf/2006.16668.pdf for details. 
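# The helper below keeps the einsum notation of the GShard paper but, when
# USE_EINSUM is disabled, rewrites the handful of rules used by the gating code
# (e.g. "se,sc->sec", "sec,sm->ecm") into equivalent reshape/matmul/bmm ops.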
def einsum(rule, a, b): if USE_EINSUM: return paddle.einsum(rule, a, b) elif rule == 's,se->se': return a.reshape((a.shape[0], -1)) * b elif rule == 'se,sc->sec': return a.unsqueeze(2) * b.unsqueeze(1) elif rule == 'se,se->s': return paddle.bmm(paddle.unsqueeze(a, 1), paddle.unsqueeze(b, 2)).reshape((-1)) elif rule == 'sec,sm->ecm': s = a.shape[0] e = a.shape[1] c = a.shape[2] m = b.shape[1] return paddle.matmul(a.reshape((s, -1)).t(), b).reshape((e, c, m)) elif rule == 'sec,ecm->sm': return paddle.matmul( a.reshape((a.shape[0], -1)), b.reshape((-1, b.shape[-1]))) elif rule == 'ks,ksm->sm': k = b.shape[0] s = b.shape[1] m = b.shape[2] # [k, s] -> [s, k] -> [s, 1, k] a = a.t().unsqueeze(1) # [k,s,m] -> [k, sm] -> [sm, k] -> [s, m, k] b = b.reshape((k, -1)).t().reshape((s, m, k)) # bmm([s, 1, k], [s, m, k]^t) -> [s, m, 1] return paddle.bmm(a, b.transpose(1, 2)).squeeze(2) else: return paddle.einsum(rule, a, b) def _capacity(gates, capacity_factor, min_capacity): # gates has shape of SE num_tokens = gates.shape[0] num_experts = gates.shape[1] capacity = paddle.ceil( (num_tokens / num_experts) * capacity_factor).astype(paddle.int64) if capacity < min_capacity: capacity = min_capacity.astype(paddle.int64) return capacity def _top_idx(source, k): return paddle.topk(source, k=k, axis=0)[1] def top1gating(logits, capacity_factor: float, min_capacity: int, used_token: Tensor=None, noisy_gate_policy: Optional[str]=None, drop_tokens: bool=True, use_rts: bool=True) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top1Gating on logits.""" if noisy_gate_policy == 'RSample': logits_w_noise = logits + \ gumbel_rsample(logits.shape, device=logits.device) # everything is in fp32 in this function gates = F.softmax(logits, axis=1) capacity = _capacity(gates, paddle.to_tensor(capacity_factor), paddle.to_tensor(min_capacity)) # Create a mask for 1st's expert per token # noisy gating indices1_s = paddle.argmax( logits_w_noise if noisy_gate_policy == 'RSample' else gates, axis=1) num_experts = int(gates.shape[1]) assert(0 <= indices1_s.min() and indices1_s.max() < num_experts) mask1 = F.one_hot(indices1_s, num_classes=num_experts) # mask only used tokens if used_token is not None: mask1 = einsum("s,se->se", used_token, mask1) # gating decisions exp_counts = paddle.sum(mask1, axis=0).detach() # if we don't want to drop any tokens if not drop_tokens: new_capacity = paddle.max(exp_counts) # dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, # group=dist.get_world_group()) # capacity = new_capacity group = dist.collective._get_default_group() task = group.process_group.all_reduce(new_capacity, dist.ReduceOp.MAX) task.wait() # Compute l_aux me = paddle.mean(gates, axis=0) ce = paddle.mean(mask1.astype("float32"), axis=0) l_aux = paddle.sum(me * ce) * num_experts # Random Token Selection if use_rts: device = paddle.get_device() uniform = exp_selection_uniform_map.get(device) if uniform is None: uniform = Uniform( low=paddle.to_tensor(0.0), high=paddle.to_tensor(1.0)).rsample exp_selection_uniform_map[device] = uniform mask1_rand = mask1 * uniform(mask1.shape) else: mask1_rand = mask1 assert logits.shape[ 0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size." 
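    # Capacity limiting: keep at most `capacity` tokens per expert (those with
    # the largest, optionally randomized, mask values) and zero out the rest of
    # mask1 so that dropped tokens are not dispatched.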
top_idx = _top_idx(mask1_rand, capacity) new_mask1 = paddle.zeros_like(mask1).put_along_axis_( indices=top_idx, values=1., axis=0) mask1 *= new_mask1 # Compute locations in capacity buffer with paddle.amp.auto_cast(False, level='O2'): locations1 = paddle.cumsum(mask1.astype(paddle.float32), axis=0) - 1 # Store the capacity location for each token locations1_s = paddle.sum(locations1 * mask1.astype(paddle.float32), axis=1) # Normalize gate probabilities mask1_float = mask1.astype("float32") gates = gates * mask1_float assert(0 <= locations1_s.astype(paddle.int32).min() and locations1_s.astype(paddle.int32).max() < capacity) locations1_sc = F.one_hot(locations1_s.astype(paddle.int32), capacity).astype(paddle.float32) combine_weights = einsum("se,sc->sec", gates, locations1_sc) dispatch_mask = combine_weights.astype("bool") return l_aux, combine_weights, dispatch_mask, exp_counts def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, axis=1) capacity = _capacity(gates, paddle.to_tensor(capacity_factor * 2), paddle.to_tensor(min_capacity)) # Create a mask for 1st's expert per token indices1_s = paddle.argmax(gates, axis=1) num_experts = int(gates.shape[1]) mask1 = F.one_hot(indices1_s, num_classes=num_experts) # Create a mask for 2nd's expert per token using Gumbel-max trick # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/ logits_w_noise = logits + gumbel_rsample(logits.shape) # Replace top-expert with min value # logits_except1 = logits_w_noise.masked_fill(mask1.astype("bool"), float("-inf")) logits_except1 = paddle.where( mask1.astype("bool"), paddle.ones(logits_w_noise.shape) * float("-inf"), logits_w_noise) indices2_s = paddle.argmax(logits_except1, axis=1) mask2 = F.one_hot(indices2_s, num_classes=num_experts) # Compute locations in capacity buffer locations1 = paddle.cumsum(mask1, axis=0) - 1 locations2 = paddle.cumsum(mask2, axis=0) - 1 # Update 2nd's location by accounting for locations of 1st locations2 += paddle.sum(mask1, axis=0, keepdim=True) # gating decisions exp_counts = paddle.sum(mask1, axis=0).detach() # Compute l_aux me = paddle.mean(gates, axis=0) ce = paddle.mean(mask1.astype("float32"), axis=0) l_aux = paddle.mean(me * ce) * num_experts * num_experts # Remove locations outside capacity from mask mask1 *= paddle.less_than(locations1, capacity) mask2 *= paddle.less_than(locations2, capacity) # Store the capacity location for each token locations1_s = paddle.sum(locations1 * mask1, axis=1) locations2_s = paddle.sum(locations2 * mask2, axis=1) # Normalize gate probabilities mask1_float = mask1.astype("float32") mask2_float = mask2.astype("float32") gates1_s = einsum("se,se->s", gates, mask1_float) gates2_s = einsum("se,se->s", gates, mask2_float) denom_s = gates1_s + gates2_s # Avoid divide-by-zero # HACK: paddle currently does not support finfo, use constant instead min_constant = 1.1920928955078125e-07 denom_s = paddle.clip(denom_s, min=min_constant) gates1_s /= denom_s gates2_s /= denom_s # Calculate combine_weights and dispatch_mask gates1 = einsum("s,se->se", gates1_s, mask1_float) gates2 = einsum("s,se->se", gates2_s, mask2_float) locations1_sc = F.one_hot(locations1_s, capacity) locations2_sc = F.one_hot(locations2_s, capacity) combine1_sec = einsum("se,sc->sec", gates1, locations1_sc) combine2_sec = einsum("se,sc->sec", gates2, locations2_sc) combine_weights = combine1_sec + 
combine2_sec dispatch_mask = combine_weights.astype("bool") return l_aux, combine_weights, dispatch_mask, exp_counts class TopKGate(nn.Layer): """Gate module which implements Top2Gating as described in Gshard_. :: gate = TopKGate(model_dim, num_experts) l_aux, combine_weights, dispatch_mask = gate(input) .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf Args: model_dim (int): size of model embedding dimension num_experts (ints): number of experts in model """ wg: nn.Linear def __init__(self, model_dim: int, num_experts: int, k: int=1, capacity_factor: float=1.0, eval_capacity_factor: float=1.0, min_capacity: int=8, noisy_gate_policy: Optional[str]=None, drop_tokens: bool=True, use_rts: bool=True) -> None: super().__init__() # Only top-1 and top-2 are supported at the moment. if k != 1 and k != 2: raise ValueError('Only top-1 and top-2 gatings are supported.') self.wg = nn.Linear(model_dim, num_experts).to(dtype=paddle.float32) self.k = k self.capacity_factor = capacity_factor self.eval_capacity_factor = eval_capacity_factor self.min_capacity = min_capacity self.noisy_gate_policy = noisy_gate_policy # self.timers = SynchronizedWallClockTimer() self.wall_clock_breakdown = False self.gate_time = 0.0 self.drop_tokens = drop_tokens self.use_rts = use_rts def forward(self, input: paddle.Tensor, used_token: paddle.Tensor=None ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore # if self.wall_clock_breakdown: # self.timers('TopKGate').start() if self.wg.weight.dtype != paddle.float32: self.wg = self.wg.to(dtype=paddle.float32) input_fp32 = input.astype("float32") # input jittering if self.noisy_gate_policy == 'Jitter' and self.training: input_fp32 = multiplicative_jitter(input_fp32) logits = self.wg(input_fp32) if self.k == 1: gate_output = top1gating( logits, self.capacity_factor if self.training else self.eval_capacity_factor, self.min_capacity, used_token, self.noisy_gate_policy if self.training else None, self.drop_tokens, self.use_rts) else: gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, self.min_capacity) # if self.wall_clock_breakdown: # self.timers('TopKGate').stop() # self.gate_time = self.timers('TopKGate').elapsed(reset=False) return gate_output class MOELayer(nn.Layer): def __init__(self, gate: nn.Layer, experts: nn.Layer, ep_group_name, ep_size, num_local_experts: int) -> None: super().__init__() self.gate = gate self.experts = experts self.ep_group = None self.ep_size = ep_size self.ep_group_name = ep_group_name self.num_local_experts = num_local_experts self.time_falltoall = 0.0 self.time_salltoall = 0.0 self.time_moe = 0.0 # self.timers = SynchronizedWallClockTimer() self.wall_clock_breakdown = False #HACK need fix # self.hcg = fleet.get_hybrid_communicate_group() self.hcg = None def _set_ep_group(self, ep_group): self.ep_group = ep_group def get_loss(self): return self.l_aux def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: # if self.wall_clock_breakdown: # self.timers('moe').start() # Implement Algorithm 2 from GShard paper. d_model = input[0].shape[-1] # Initial implementation -> Reshape into S tokens by dropping sequence dimension. 
# Reshape into G groups so that each group can distribute tokens equally # group_size = kwargs['group_size'] if 'group_size' in kwargs.keys() else 1 reshaped_input = input[0].reshape((-1, d_model)) self.l_aux, combine_weights, dispatch_mask, self.exp_counts = self.gate( reshaped_input, input[1]) dispatched_input = einsum("sec,sm->ecm", dispatch_mask.astype(input[0].dtype), reshaped_input) # if self.wall_clock_breakdown: # self.timers('falltoall').start() # HACK: _get_expert_model_parallel_world_size is needed here if False and self.hcg.get_model_parallel_group().nranks == 1: # If the non-expert is tensor-parallel, it will create # duplicate tokens on the tensor-parallel ranks. # Since our experts are not tensor-parallel, these duplicates # need to be dropped to ensure correctness. # this also doubles up as a communication optimization as we are # reducing the all-to-all communication volume. dispatched_input = drop_tokens(dispatched_input, axis=1) # HACK disable AllToAll # dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input) # if self.wall_clock_breakdown: # self.timers('falltoall').stop() # self.time_falltoall = self.timers('falltoall').elapsed(reset=False) # Re-shape after all-to-all: ecm -> gecm dispatched_input = dispatched_input.reshape( (self.ep_size, self.num_local_experts, -1, d_model)) expert_output = self.experts(dispatched_input) # if self.wall_clock_breakdown: # self.timers('salltoall').start() # HACK disable AllToAll # expert_output = _AllToAll.apply(self.ep_group, expert_output) # if self.wall_clock_breakdown: # self.timers('salltoall').stop() # self.time_salltoall = self.timers('salltoall').elapsed(reset=False) # Re-shape back: gecm -> ecm expert_output = expert_output.reshape( (self.ep_size * self.num_local_experts, -1, d_model)) # HACK: _get_expert_model_parallel_world_size is needed here if False and self.hcg.get_model_parallel_group().nranks == 1: # the dropped duplicate tokens need to be gathered on each # tensor parallel rank again for the tensor-parallel # non-expert of the next layer. expert_output = gather_tokens(expert_output, axis=1) combined_output = einsum("sec,ecm->sm", combine_weights.astype(input[0].dtype), expert_output) a = combined_output.reshape((input[0].shape)) # if self.wall_clock_breakdown: # self.timers('moe').stop() # self.time_moe = self.timers('moe').elapsed(reset=False) return a ================================================ FILE: ppfleetx/models/language_model/t5/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .modeling import (finfo, ACT2FN, ModelOutput, get_t5_model, t5_encode_text, get_encoded_dim) from .utils import normal_, constant_init ================================================ FILE: ppfleetx/models/language_model/t5/modeling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import copy import json import numpy as np from collections import OrderedDict from typing import Callable, List, Optional, Set, Tuple, Union, Any import paddle from paddle import nn from ppfleetx.data.tokenizers.t5_tokenizer import ( t5_tokenize, get_t5_tokenizer, DEFAULT_T5_NAME) from ppfleetx.models.multimodal_model.imagen.utils import rearrange, exists, default def finfo(dtype): if dtype == paddle.float32: return np.finfo(np.float32) if dtype == paddle.float16: return np.finfo(np.float16) if dtype == paddle.float64: return np.finfo(np.float64) def fields(class_or_instance): """Return a tuple describing the fields of this dataclass. Accepts a dataclass or an instance of one. Tuple elements are of type Field. """ # Might it be worth caching this, per class? try: fields = getattr(class_or_instance, _FIELDS) except AttributeError: raise TypeError('must be called with a dataclass type or instance') # Exclude pseudo-fields. Note that fields is sorted by insertion # order, so the order of the tuple is as the fields were defined. return tuple(f for f in fields.values() if f._field_type is _FIELD) def is_tensor(x): return isinstance(x, paddle.Tensor) class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular python dictionary. You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple before. """ def __post_init__(self): class_fields = fields(self) # Safety and consistency checks if not len(class_fields): raise ValueError(f"{self.__class__.__name__} has no fields.") if not all(field.default is None for field in class_fields[1:]): raise ValueError( f"{self.__class__.__name__} should not have more than one required field." ) first_field = getattr(self, class_fields[0].name) other_fields_are_none = all( getattr(self, field.name) is None for field in class_fields[1:]) if other_fields_are_none and not is_tensor(first_field): if isinstance(first_field, dict): iterator = first_field.items() first_field_iterator = True else: try: iterator = iter(first_field) first_field_iterator = True except TypeError: first_field_iterator = False # if we provided an iterator as first field and the iterator is a (key, value) iterator # set the associated fields if first_field_iterator: for element in iterator: if (not isinstance(element, (list, tuple)) or not len(element) == 2 or not isinstance(element[0], str)): break setattr(self, element[0], element[1]) if element[1] is not None: self[element[0]] = element[1] elif first_field is not None: self[class_fields[0].name] = first_field else: for field in class_fields: v = getattr(self, field.name) if v is not None: self[field.name] = v def __delitem__(self, *args, **kwargs): raise Exception( f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." 
) def setdefault(self, *args, **kwargs): raise Exception( f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." ) def pop(self, *args, **kwargs): raise Exception( f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") def update(self, *args, **kwargs): raise Exception( f"You cannot use ``update`` on a {self.__class__.__name__} instance." ) def __getitem__(self, k): if isinstance(k, str): inner_dict = {k: v for (k, v) in self.items()} return inner_dict[k] else: return self.to_tuple()[k] def __setattr__(self, name, value): if name in self.keys() and value is not None: # Don't call self.__setitem__ to avoid recursion errors super().__setitem__(name, value) super().__setattr__(name, value) def __setitem__(self, key, value): # Will raise a KeyException if needed super().__setitem__(key, value) # Don't call self.__setattr__ to avoid recursion errors super().__setattr__(key, value) def to_tuple(self) -> Tuple[Any]: """ Convert self to a tuple containing all the attributes/keys that are not `None`. """ return tuple(self[k] for k in self.keys()) class NewGELUActivation(nn.Layer): """ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ def forward(self, input): return 0.5 * input * (1.0 + paddle.tanh( math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) class GELUActivation(nn.Layer): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. For information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))) This is now written in C in nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ def __init__(self, use_gelu_python: bool=False): super().__init__() self.act = nn.functional.gelu def _gelu_python(self, input): return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) def forward(self, input): return self.act(input) class FastGELUActivation(nn.Layer): """ Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs """ def forward(self, input): return 0.5 * input * ( 1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) class QuickGELUActivation(nn.Layer): """ Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs """ def forward(self, input): return input * paddle.nn.functional.sigmoid(1.702 * input) class ClippedGELUActivation(nn.Layer): """ Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to https://arxiv.org/abs/2004.09602. Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))). 
See https://arxiv.org/abs/1606.08415 """ def __init__(self, min: float, max: float): if min > max: raise ValueError( f"min should be < max (got min: {min}, max: {max})") super().__init__() self.min = min self.max = max def forward(self, x): return paddle.clip(gelu(x), self.min, self.max) class SiLUActivation(nn.Layer): """ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with later. """ def __init__(self): super().__init__() self.act = nn.functional.silu def _silu_python(self, input): return input * nn.functional.sigmoid(input) def forward(self, input): return self.act(input) class MishActivation(nn.Layer): """ See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also visit the official repository for the paper: https://github.com/digantamisra98/Mish """ def __init__(self): super().__init__() self.act = nn.functional.mish def _mish_python(self, input): return input * paddle.tanh(nn.functional.softplus(input)) def forward(self, input): return self.act(input) class LinearActivation(nn.Layer): """ Applies the linear activation function, i.e. forwarding input directly to output. """ def forward(self, input): return input ACT2FN = { "gelu": GELUActivation(), "gelu_10": ClippedGELUActivation(-10, 10), "gelu_fast": FastGELUActivation(), "gelu_new": NewGELUActivation(), "gelu_python": GELUActivation(use_gelu_python=True), "linear": LinearActivation(), "mish": MishActivation(), "quick_gelu": QuickGELUActivation(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "silu": SiLUActivation(), "swish": SiLUActivation(), "tanh": nn.Tanh(), } def get_activation(activation_string): if activation_string in ACT2FN: return ACT2FN[activation_string] else: raise KeyError( f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}" ) # For backwards compatibility with: from activations import gelu_python gelu_python = get_activation("gelu_python") gelu_new = get_activation("gelu_new") gelu = get_activation("gelu") gelu_fast = get_activation("gelu_fast") quick_gelu = get_activation("quick_gelu") silu = get_activation("silu") mish = get_activation("mish") linear_act = get_activation("linear") def prune_linear_layer(layer: nn.Linear, index: paddle.int64, dim: int=0) -> nn.Linear: """ Prune a linear layer to keep only entries in index. Used to remove heads. Args: layer (`paddle.nn.Linear`): The layer to prune. index (`paddle.int64`): The indices to keep in the layer. dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. Returns: `paddle.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`. 
""" W = layer.weight.index_select(dim, index).clone().detach() if layer.bias is not None: if dim == 1: b = layer.bias.clone().detach() else: b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) new_layer = nn.Linear( new_size[1], new_size[0], bias_attr=layer.bias is not None) new_layer.weight.requires_grad = False new_layer.weight.copy_(W) new_layer.weight.stop_gradient = False if layer.bias is not None: new_layer.bias.stop_gradient = True new_layer.bias.copy_(b) new_layer.bias.stop_gradient = False return new_layer def find_pruneable_heads_and_indices(heads, n_heads: int, head_size: int, already_pruned_heads): """ Finds the heads and their indices taking `already_pruned_heads` into account. Args: heads : List of the indices of heads to prune. n_heads : The number of heads in the model. head_size : The size of each head. already_pruned_heads : A set of already pruned heads. Returns: A tuple with the remaining heads and their corresponding indices. """ mask = paddle.ones(n_heads, head_size) heads = set( heads ) - already_pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in already_pruned_heads) mask[head] = 0 mask = mask.reshape(-1).equal(1) index = paddle.arange(len(mask))[mask].cast(paddle.int64) return heads, index class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): """ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). Args: last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. """ last_hidden_state = None past_key_values = None hidden_states = None attentions = None cross_attentions = None class T5Config(object): def __init__(self, **kwargs): # Fine-tuning task arguments self.architectures = kwargs.pop("architectures", None) self.use_return_dict = kwargs.pop("return_dict", True) self.d_ff = kwargs.pop("d_ff", None) self.d_kv = kwargs.pop("d_kv", None) self.d_model = kwargs.pop("d_model", None) self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) self.dense_act_fn = kwargs.pop("dense_act_fn", 'gelu_new') self.eos_token_id = kwargs.pop("eos_token_id", None) self.feed_forward_proj = kwargs.pop("feed_forward_proj", None) self.initializer_factor = kwargs.pop("initializer_factor", None) self.is_decoder = kwargs.pop("is_decoder", False) self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) self.is_gated_act = kwargs.pop("is_gated_act", True) self.layer_norm_epsilon = kwargs.pop("layer_norm_epsilon", None) self.model_type = kwargs.pop("model_type", None) self.num_decoder_layers = kwargs.pop("num_decoder_layers", None) self.num_heads = kwargs.pop("num_heads", None) self.num_layers = kwargs.pop("num_layers", None) self.output_past = kwargs.pop("output_past", True) self.pad_token_id = kwargs.pop("pad_token_id", None) self.relative_attention_max_distance = kwargs.pop( "relative_attention_max_distance", 128) self.relative_attention_num_buckets = kwargs.pop( "relative_attention_num_buckets", None) self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) self.transformers_version = kwargs.pop("transformers_version", None) self.use_cache = kwargs.pop("use_cache", False) self.vocab_size = kwargs.pop("vocab_size", None) self.model_type = kwargs.pop("model_type", None) self.dropout_rate = kwargs.pop("dropout_rate", None) self.output_attentions = kwargs.pop("output_attentions", False) self.output_hidden_states = kwargs.pop("output_hidden_states", False) class T5LayerNorm(nn.Layer): def __init__(self, hidden_size, eps=1e-6): super().__init__() """ Construct a layernorm module in the T5 style. No bias and no subtraction of mean. """ super().__init__() self.weight = self.create_parameter( [hidden_size], default_initializer=nn.initializer.Constant(value=1.)) self.variance_epsilon = eps def forward(self, hidden_states): # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 variance = hidden_states.cast(paddle.float32).pow(2).mean( -1, keepdim=True) hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) # convert into half-precision if necessary if self.weight.dtype in [paddle.float16, paddle.bfloat16]: hidden_states = hidden_states.cast(self.weight.dtype) return self.weight * hidden_states class T5DenseActDense(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, dense_act_fn): super().__init__() self.wi = nn.Linear(d_model, d_ff, bias_attr=False) self.wo = nn.Linear(d_ff, d_model, bias_attr=False) self.dropout = nn.Dropout(dropout_rate) self.act = ACT2FN[dense_act_fn] def forward(self, hidden_states): hidden_states = self.wi(hidden_states) hidden_states = self.act(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.wo(hidden_states) return hidden_states class T5DenseGatedActDense(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, dense_act_fn): super().__init__() self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False) self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False) self.wo = nn.Linear(d_ff, d_model, bias_attr=False) self.dropout = nn.Dropout(dropout_rate) self.act = ACT2FN[dense_act_fn] def forward(self, hidden_states): hidden_gelu = self.act(self.wi_0(hidden_states)) hidden_linear = self.wi_1(hidden_states) hidden_states = hidden_gelu * hidden_linear hidden_states = self.dropout(hidden_states) hidden_states = self.wo(hidden_states) return hidden_states class T5LayerFF(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon, feed_forward_proj): super().__init__() if feed_forward_proj == "gated-gelu": self.DenseReluDense = T5DenseGatedActDense( d_model, d_ff, dropout_rate, dense_act_fn) elif feed_forward_proj == "relu": self.DenseReluDense = T5DenseActDense(d_model, d_ff, dropout_rate, feed_forward_proj) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward(self, hidden_states): forwarded_states = self.layer_norm(hidden_states) forwarded_states = self.DenseReluDense(forwarded_states) hidden_states = hidden_states + self.dropout(forwarded_states) return hidden_states class T5Attention(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, has_relative_attention_bias=False): super().__init__() self.is_decoder = is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.relative_attention_num_buckets = relative_attention_num_buckets self.d_model = d_model self.key_value_proj_dim = d_kv self.n_heads = num_heads self.dropout = dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) self.k = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) self.v = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) self.o = nn.Linear(self.inner_dim, self.d_model, bias_attr=False) if self.has_relative_attention_bias: self.relative_attention_bias = nn.Embedding( self.relative_attention_num_buckets, self.n_heads) self.pruned_heads = set() self.gradient_checkpointing = False def prune_heads(self, heads): if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices( heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads) # Prune linear layers self.q = prune_linear_layer(self.q, 
index) self.k = prune_linear_layer(self.k, index) self.v = prune_linear_layer(self.v, index) self.o = prune_linear_layer(self.o, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.inner_dim = self.key_value_proj_dim * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) @staticmethod def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 Translate relative position to a bucket number for relative attention. The relative position is defined as memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should allow for more graceful generalization to longer sequences than the model has been trained on Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ relative_buckets = 0 if bidirectional: num_buckets //= 2 relative_buckets += ( relative_position > 0).cast(paddle.int64) * num_buckets relative_position = paddle.abs(relative_position) else: relative_position = -paddle.min( relative_position, paddle.zeros_like(relative_position)) # now relative_position is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 is_small = relative_position < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance relative_position_if_large = max_exact + ( paddle.log(relative_position.cast('float32') / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)).cast(paddle.int64) relative_position_if_large = paddle.minimum( relative_position_if_large, paddle.full_like(relative_position_if_large, num_buckets - 1)) relative_buckets += paddle.where(is_small, relative_position, relative_position_if_large) return relative_buckets def compute_bias(self, query_length, key_length, device=None): """Compute binned relative position bias""" context_position = paddle.arange( query_length, dtype=paddle.int64)[:, None] memory_position = paddle.arange( key_length, dtype=paddle.int64)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( relative_position, # shape (query_length, key_length) bidirectional=(not self.is_decoder), num_buckets=self.relative_attention_num_buckets, ) values = self.relative_attention_bias( relative_position_bucket ) # shape (query_length, key_length, num_heads) values = values.transpose([2, 0, 1]).unsqueeze( 0) # shape (1, num_heads, query_length, key_length) return values def forward( self, hidden_states, mask=None, key_value_states=None, position_bias=None, past_key_value=None, layer_head_mask=None, query_length=None, use_cache=False, output_attentions=False, ): """ Self-attention (if key_value_states is None) or attention over 
source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if past_key_value is not None: assert ( len(past_key_value) == 2 ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" real_seq_length += past_key_value[0].shape[ 2] if query_length is None else query_length key_length = real_seq_length if key_value_states is None else key_value_states.shape[ 1] def shape(states): """projection""" return states.reshape( [0, -1, self.n_heads, self.key_value_proj_dim]).transpose( [0, 2, 1, 3]) def unshape(states): """reshape""" return states.transpose([0, 2, 1, 3]).reshape( [batch_size, -1, self.inner_dim]) def project(hidden_states, proj_layer, key_value_states, past_key_value): """projects hidden states correctly to key/query states""" if key_value_states is None: # self-attn # (batch_size, n_heads, seq_length, dim_per_head) hidden_states = shape(proj_layer(hidden_states)) elif past_key_value is None: # cross-attn # (batch_size, n_heads, seq_length, dim_per_head) hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: if key_value_states is None: # self-attn # (batch_size, n_heads, key_length, dim_per_head) hidden_states = paddle.concat( [past_key_value, hidden_states], axis=2) else: # cross-attn hidden_states = past_key_value return hidden_states # get query states query_states = shape(self.q( hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) # get key/value states key_states = project(hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None) value_states = project(hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None) # compute scores scores = paddle.matmul( query_states, key_states.transpose([0, 1, 3, 2]) ) # equivalent of paddle.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 if position_bias is None: if not self.has_relative_attention_bias: position_bias = paddle.zeros( (1, self.n_heads, real_seq_length, key_length), dtype=scores.dtype) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: position_bias = self.compute_bias(real_seq_length, key_length) # if key and values are already calculated # we want only the last query position bias if past_key_value is not None: position_bias = position_bias[:, :, -hidden_states.size(1):, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) scores += position_bias attn_weights = nn.functional.softmax( scores.cast('float32'), axis=-1).astype( scores.dtype) # (batch_size, n_heads, seq_length, key_length) attn_weights = nn.functional.dropout( attn_weights, p=self.dropout, training=self. 
training) # (batch_size, n_heads, seq_length, key_length) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask attn_output = unshape(paddle.matmul( attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) present_key_value_state = (key_states, value_states) if ( self.is_decoder and use_cache) else None outputs = (attn_output, ) + (present_key_value_state, ) + ( position_bias, ) if output_attentions: outputs = outputs + (attn_weights, ) return outputs class T5LayerSelfAttention(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, has_relative_attention_bias=False): super().__init__() self.SelfAttention = T5Attention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward( self, hidden_states, attention_mask=None, position_bias=None, layer_head_mask=None, past_key_value=None, use_cache=False, output_attentions=False, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( normed_hidden_states, mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states, ) + attention_output[1:] # add attentions if we output them return outputs class T5LayerCrossAttention(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon): super().__init__() self.EncDecAttention = T5Attention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, has_relative_attention_bias=False) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward( self, hidden_states, key_value_states, attention_mask=None, position_bias=None, layer_head_mask=None, past_key_value=None, use_cache=False, query_length=None, output_attentions=False, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( normed_hidden_states, mask=attention_mask, key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=past_key_value, use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output, ) + attention_output[1:] # add attentions if we output them return outputs class T5Block(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, feed_forward_proj, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, d_ff, has_relative_attention_bias=False): super().__init__() self.is_decoder = is_decoder self.layer = nn.LayerList() self.layer.append( T5LayerSelfAttention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, has_relative_attention_bias=has_relative_attention_bias)) if self.is_decoder: self.layer.append( T5LayerCrossAttention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon)) self.layer.append( T5LayerFF(d_model, d_ff, dropout_rate, layer_norm_epsilon, feed_forward_proj)) 
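# Sublayer layout inside a T5Block: self.layer[0] is self-attention, self.layer[1] is
# cross-attention (decoder blocks only), and self.layer[-1] is the feed-forward layer;
# forward() below relies on this positional ordering.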
def forward( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, use_cache=False, output_attentions=False, return_dict=True, ): if past_key_value is not None: if not self.is_decoder: logger.warning( "`past_key_values` is passed to the encoder. Please make sure this is intended." ) expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 if len(past_key_value) != expected_num_past_key_values: raise ValueError( f"There should be {expected_num_past_key_values} past states. " f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" f"Got {len(past_key_value)} past key / value states") self_attn_past_key_value = past_key_value[:2] cross_attn_past_key_value = past_key_value[2:] else: self_attn_past_key_value, cross_attn_past_key_value = None, None self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=self_attn_past_key_value, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states, present_key_value_state = self_attention_outputs[:2] attention_outputs = self_attention_outputs[ 2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training if hidden_states.dtype == paddle.float16 and paddle.isinf( hidden_states).any(): clamp_value = finfo(hidden_states.dtype).max - 1000 hidden_states = paddle.clip( hidden_states, min=-clamp_value, max=clamp_value) do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here if present_key_value_state is not None: query_length = present_key_value_state[0].shape[2] else: query_length = None cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, query_length=query_length, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states = cross_attention_outputs[0] # clamp inf values to enable fp16 training if hidden_states.dtype == paddle.float16 and paddle.isinf( hidden_states).any(): clamp_value = finfo(hidden_states.dtype).max - 1000 hidden_states = paddle.clip( hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states if present_key_value_state is not None: present_key_value_state = present_key_value_state + cross_attention_outputs[ 1] # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) # clamp inf values to enable fp16 training if hidden_states.dtype == paddle.float16 and paddle.isinf( hidden_states).any(): clamp_value = finfo(hidden_states.dtype).max - 1000 hidden_states = paddle.clip( hidden_states, min=-clamp_value, max=clamp_value) outputs = (hidden_states, ) if use_cache: outputs = outputs + (present_key_value_state, ) + attention_outputs else: outputs = outputs + attention_outputs return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) class T5Stack(nn.Layer): def __init__(self, d_model, num_layers, layer_norm_epsilon, dropout_rate, relative_attention_num_buckets, feed_forward_proj, d_kv, num_heads, d_ff, embed_tokens=None, is_decoder=False): super().__init__() self.embed_tokens = embed_tokens self.is_decoder = is_decoder self.num_layers = num_layers self.block = nn.LayerList([ T5Block( is_decoder, relative_attention_num_buckets, feed_forward_proj, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, d_ff, has_relative_attention_bias=bool(i == 0)) for i in range(num_layers) ]) self.final_layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def get_input_embeddings(self): return self.embed_tokens def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings def get_extended_attention_mask(self, attention_mask, input_shape): """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: attention_mask (`paddle.Tensor`): Mask with ones indicating tokens to attend to, zeros for tokens to ignore. input_shape (`Tuple[int]`): The shape of the input to the model. Returns: `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. """ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
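# For example, a 2D padding mask of shape [batch_size, seq_length] is expanded below to
# [batch_size, 1, 1, seq_length], so that it broadcasts over every attention head and
# query position when it is later added to the raw attention scores.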
if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. #extended_attention_mask = extended_attention_mask.cast(dtype='float16') # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False): """ Prepare the head mask if needed. Args: head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). num_hidden_layers (`int`): The number of hidden layers in the model. is_attention_chunked: (`bool`, *optional*, defaults to `False`): Whether or not the attentions scores are computed by chunks or not. Returns: `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with `[None]` for each layer. """ if head_mask is not None: head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: head_mask = [None] * num_hidden_layers return head_mask def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( -1).unsqueeze(-1) head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( -1) # We can specify head_mask for each layer assert head_mask.dim( ) == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" #head_mask = head_mask.cast(dtype=self.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( self, input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, inputs_embeds=None, head_mask=None, cross_attn_head_mask=None, past_key_values=None, use_cache=False, output_attentions=False, output_hidden_states=False, return_dict=True, ): if use_cache is True: assert ( self.is_decoder ), f"`use_cache` can only be set to `True` if {self} is used as a decoder" output_hidden_states = (output_hidden_states if output_hidden_states is not None else False) if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: input_shape = input_ids.shape input_ids = input_ids.reshape([-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix 
= "decoder_" if self.is_decoder else "" raise ValueError( f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds" ) if inputs_embeds is None: assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past mask_seq_length = past_key_values[0][0].shape[ 2] + seq_length if past_key_values is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder" if attention_mask is None: attention_mask = paddle.ones(batch_size, mask_seq_length) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones( batch_size, encoder_seq_length, dtype=paddle.int64) # initialize past_key_values with `None` if past does not exist if past_key_values is None: past_key_values = [None] * len(self.block) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask = self.get_extended_attention_mask( attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) encoder_extended_attention_mask = self.invert_attention_mask( encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.num_layers) present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None position_bias = None encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) for i, (layer_module, past_key_value ) in enumerate(zip(self.block, past_key_values)): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) layer_outputs = layer_module( hidden_states, attention_mask=extended_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: layer_outputs = layer_outputs[:1] + (None, ) + layer_outputs[1:] hidden_states, present_key_value_state = layer_outputs[:2] # We share the position biases between the layers - 
the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), # (cross-attention position bias), (cross-attention weights) position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[ 4 if output_attentions else 3] # append next layer key value states if use_cache: present_key_value_states = present_key_value_states + ( present_key_value_state, ) if output_attentions: all_attentions = all_attentions + (layer_outputs[3], ) if self.is_decoder: all_cross_attentions = all_cross_attentions + ( layer_outputs[5], ) hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) if not return_dict: return tuple(v for v in [ hidden_states, present_key_value_states, all_hidden_states, all_attentions, all_cross_attentions, ] if v is not None) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=present_key_value_states, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) class T5EncoderModel(nn.Layer): authorized_missing_keys = [r"encoder.embed_tokens.weight", ] def __init__(self, vocab_size=32128, d_model=768, d_kv=64, d_ff=3072, num_layers=12, num_decoder_layers=12, num_heads=12, relative_attention_num_buckets=32, dropout_rate=0.1, layer_norm_epsilon=1e-06, feed_forward_proj="relu"): super().__init__() self.shared = nn.Embedding(vocab_size, d_model) # self.extra_parameters = list(self.shared.parameters()) use_cache = False is_encoder_decoder = False self.encoder = T5Stack( d_model, num_layers, layer_norm_epsilon, dropout_rate, relative_attention_num_buckets, feed_forward_proj, d_kv, num_heads, d_ff, embed_tokens=self.shared, is_decoder=False) def get_input_embeddings(self): return self.shared def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) def get_encoder(self): return self.encoder def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Returns: Example: ```python >>> from transformers import T5Tokenizer, T5EncoderModel >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") >>> model = T5EncoderModel.from_pretrained("t5-small") >>> input_ids = tokenizer( ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" ... 
).input_ids # Batch size 1 >>> outputs = model(input_ids=input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else True #import numpy as np #attention_mask = paddle.to_tensor(np.load('attn_mask.npy')) #input_ids = paddle.to_tensor(np.load('input_ids.npy')) encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) return encoder_outputs def T5Model(config): config = T5Config(**config) model = T5EncoderModel(config) return model def get_t5_model(name, pretrained=True): #t5_config = dict_from_json_file(name) #model = T5Model(t5_config) model = T5EncoderModel( vocab_size=32128, d_model=1024, d_kv=128, d_ff=65536, num_layers=2, num_decoder_layers=None, num_heads=128, relative_attention_num_buckets=32, dropout_rate=0., layer_norm_epsilon=1e-06, feed_forward_proj="relu") if pretrained: checkpoint = paddle.load(name + '/t5.pd', return_numpy=True) model.set_state_dict(checkpoint['model']) model.eval() for p in model.parameters(): p.stop_gradient = True return model def t5_11b(): return T5EncoderModel( vocab_size=32128, d_model=1024, d_kv=128, d_ff=65536, num_layers=24, num_decoder_layers=None, num_heads=128, relative_attention_num_buckets=32, dropout_rate=0., layer_norm_epsilon=1e-06, feed_forward_proj="relu") def dict_from_json_file(name): with open(name + '/config.json', "r", encoding="utf-8") as reader: text = reader.read() config_dict = json.loads(text) return config_dict def t5_encode_text(t5, texts, tokenizer, return_attn_mask=False): token_ids, attn_mask = t5_tokenize(texts, tokenizer) t5.eval() with paddle.no_grad(): encoded_text = t5(input_ids=token_ids, attention_mask=attn_mask) text_features = encoded_text.last_hidden_state.detach() if return_attn_mask: #attn_mask = attn_mask.cast('bool') return text_features, attn_mask return text_features def get_encoded_dim(name): return dict_from_json_file(name)['d_model'] ================================================ FILE: ppfleetx/models/language_model/t5/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
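# A typical call pattern for the T5 encoder utilities defined in modeling.py above
# (a sketch only: the checkpoint directory is a placeholder, and the exact argument of
# get_t5_tokenizer is an assumption rather than taken from this file):
#
#     tokenizer = get_t5_tokenizer(DEFAULT_T5_NAME)
#     t5 = get_t5_model('./t5-checkpoint', pretrained=False)  # pretrained=True loads '<name>/t5.pd'
#     text_features = t5_encode_text(t5, ['a photo of a corgi'], tokenizer)
#     # text_features: last_hidden_state of shape [batch, seq_len, d_model], detached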
import paddle from paddle.nn.initializer import TruncatedNormal, Constant, Normal trunc_normal_ = TruncatedNormal(std=0.02) zeros_ = Constant(value=0.0) ones_ = Constant(value=1.0) @paddle.no_grad() def constant_(x, value): temp_value = paddle.full(x.shape, value, x.dtype) x.set_value(temp_value) return x @paddle.no_grad() def normal_(x, mean=0., std=1.): temp_value = paddle.normal(mean, std, shape=x.shape) x.set_value(temp_value) return def normal_init(layer, mean=0, std=1, bias=0): if hasattr(layer, 'weight') and layer.weight is not None: normal_(layer.weight, mean, std) else: normal_(layer, mean, std) if hasattr(layer, 'bias') and layer.bias is not None: constant_(layer.bias, bias) def constant_init(layer, val, bias=0): if hasattr(layer, 'weight') and layer.weight is not None: constant_(layer.weight, val) if hasattr(layer, 'bias') and layer.bias is not None: constant_(layer.bias, bias) ================================================ FILE: ppfleetx/models/language_model/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import copy import yaml import numpy as np import paddle import paddle.distributed as dist from paddle.fluid import core import argparse from functools import reduce from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False def process_inference_configs(config): """ process inference configs for hybrid parallel """ if 'Inference' not in config.keys(): return configs = config['Inference'] if configs['model_dir'] is None: configs['model_dir'] = config['Engine']['save_load']['output_dir'] if configs['mp_degree'] is None: configs['mp_degree'] = config['Distributed']['mp_degree'] def process_model_configs(config): """ process model configs for hybrid parallel """ configs = config['Model'] if configs['ffn_hidden_size'] is None: configs['ffn_hidden_size'] = 4 * configs['hidden_size'] if configs['use_recompute']: if not configs['recompute_granularity']: configs['recompute_granularity'] = 'full' if not configs['no_recompute_layers']: configs['no_recompute_layers'] = [] else: assert isinstance(configs['no_recompute_layers'], list), "no_recompute_layers should be a list" for i in configs['no_recompute_layers']: assert isinstance( i, int ), "all values in no_recompute_layers should be an integer" assert min(configs['no_recompute_layers']) >= 0, \ "the min value in no_recompute_layers should >= 0" assert max(configs['no_recompute_layers']) < configs['num_layers'], \ "the max value in no_recompute_layers should < num_layers" configs['no_recompute_layers'] = sorted( list(set(configs['no_recompute_layers']))) if configs['fused_linear'] and not is_fused_matmul_bias_supported(): configs['fused_linear'] = False logging.warning( "The flag 
fused_linear only valid for cuda version higher than 11.6, " "but the paddle is compiled with cuda " + paddle.version.cuda()) pp_degree = config.Distributed.pp_degree if pp_degree > 1: configs['virtual_pp_degree'] = 1 \ if configs.get('virtual_pp_degree', None) is None \ else configs['virtual_pp_degree'] virtual_pp_degree = configs['virtual_pp_degree'] num_layers = configs.num_layers if not (num_layers % (virtual_pp_degree * pp_degree)) == 0: assert virtual_pp_degree == 1, "virtual pp doesn't support uneven layer split." logger.warning( "The num_layers of the model is not divisible by pp_degree." \ "Receive num_layers: {}, pp_degree: {}.".format(num_layers, pp_degree)) else: assert (num_layers % (virtual_pp_degree * pp_degree)) == 0, \ "The num_layers of the model should be divisible of pp_degree * virtual_pp_degree." \ "Receive num_layers: {}, pp_degree: {}, virtual_pp_degree: {}.".format( num_layers, pp_degree, virtual_pp_degree) if virtual_pp_degree > 1: local_batch_size = config.Global.local_batch_size micro_batch_size = config.Global.micro_batch_size acc_steps = local_batch_size // micro_batch_size assert acc_steps % pp_degree == 0, "num of microbatches {} should be divisible of pp_degree {} when " \ "using interleave pipeline".format(acc_steps, pp_degree) if virtual_pp_degree > 2: logger.warning( "Setting virtual_pp_degree > 2 may harm the throughput of the pipeline parallel." ) else: if configs.get('virtual_pp_degree', None): logger.warning("virtual_pp_degree is unuseful.") def process_optim_configs(config): """ process optim configs for hybrid parallel """ config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][ 'enable'] nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] if config['Optimizer']['tensor_fusion']: assert nranks == dp_degree * sharding_degree, \ "tensor_fusion only support single card train or data/sharding parallel train" if config['Optimizer']['lr']['decay_steps'] is None: config['Optimizer']['lr']['decay_steps'] = config['Engine'][ 'max_steps'] config['Optimizer']['lr']['decay_steps'] *= config['Global'][ 'global_batch_size'] def process_data_configs(config): """ process data configs for hybrid parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] cfg_data[mode]['dataset']['model_type'] = config['Model']['name'] cfg_data[mode]['sampler']['batch_size'] = cfg_global[ 'local_batch_size'] def process_configs(config): process_data_configs(config) process_model_configs(config) process_optim_configs(config) process_inference_configs(config) return config ================================================ FILE: ppfleetx/models/multimodal_model/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/multimodal_model/clip/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/multimodal_model/imagen/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .modeling import (ImagenModel, imagen_397M_text2im_64, imagen_text2im_64, imagen_text2im_64_debertav2, imagen_SR256, imagen_SR1024, ImagenCriterion) ================================================ FILE: ppfleetx/models/multimodal_model/imagen/modeling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
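(A minimal, illustrative training-step sketch for the Imagen classes defined in this file, not part of the repository sources: the dataloader, optimizer settings, and the 't5-11b' encoder name are assumptions. It only shows how the pieces fit together, since ImagenModel.forward returns the tuple (pred, target, log_snr, p2_loss_weight_gamma) that ImagenCriterion reduces to a scalar loss.)

import paddle
from ppfleetx.models.multimodal_model.imagen import imagen_text2im_64, ImagenCriterion

def example_train_step(dataloader):
    # hypothetical helper; 'dataloader' yields (images, texts) where images are
    # square float32 RGB tensors of at least 64x64 and texts is a list of captions
    model = imagen_text2im_64(
        use_recompute=False,
        recompute_granularity='full',
        text_encoder_name='t5-11b')  # encoder name is an assumption
    criterion = ImagenCriterion(name='mse_loss')
    opt = paddle.optimizer.AdamW(parameters=model.parameters())

    for images, texts in dataloader:
        # the forward pass encodes the captions, noises the images, and returns
        # the prediction/target pair plus the p2-reweighting inputs
        pred, target, log_snr, gamma = model(images, texts=texts, unet_number=1)
        loss = criterion(pred, target, log_snr, gamma)
        loss.backward()
        opt.step()
        opt.clear_grad()
    return model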
from tqdm import tqdm from functools import partial from contextlib import contextmanager, nullcontext import paddle import paddle.nn.functional as F from paddle import nn import paddle.vision.transforms as T from .unet import Unet from ppfleetx.models.language_model.debertav2 import * from ppfleetx.models.language_model.t5 import * from ppfleetx.data.tokenizers import get_t5_tokenizer, get_debertav2_tokenizer from .utils import ( GaussianDiffusionContinuousTimes, default, exists, cast_tuple, first, maybe, eval_decorator, identity, pad_tuple_to_length, right_pad_dims_to, resize_image_to, normalize_neg_one_to_one, rearrange, repeat, reduce, unnormalize_zero_to_one, cast_uint8_images_to_float, is_float_dtype) # predefined unets, with configs lining up with hyperparameters in appendix of paper class Unet64_397M(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=256, dim_mults=(1, 2, 3, 4), num_resnet_blocks=3, layer_attns=(False, True, True, True), layer_cross_attns=(False, True, True, True), attn_heads=8, ff_mult=2., memory_efficient=False) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) class BaseUnet64(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=512, cond_dim=512, dim_mults=(1, 2, 3, 4), num_resnet_blocks=3, layer_attns=(False, True, True, True), layer_cross_attns=(False, True, True, True), attn_heads=8, ff_mult=2., memory_efficient=False) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) class SRUnet256(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=128, dim_mults=(1, 2, 4, 8), num_resnet_blocks=(2, 4, 8, 8), layer_attns=(False, False, False, True), layer_cross_attns=(False, False, False, True), attn_heads=8, ff_mult=2., memory_efficient=True) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) class SRUnet1024(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=128, dim_mults=(1, 2, 4, 8), num_resnet_blocks=(2, 4, 8, 8), layer_attns=False, layer_cross_attns=(False, False, False, True), attn_heads=8, ff_mult=2., memory_efficient=True) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) # main imagen ddpm class, which is a cascading DDPM from Ho et al. class ImagenCriterion(nn.Layer): """ Criterion for Imagen. It calculates the final loss. """ def __init__(self, name='mse_loss', p2_loss_weight_k=1): super(ImagenCriterion, self).__init__() self.p2_loss_weight_k = p2_loss_weight_k if name == 'l1_loss': self.loss_func = F.l1_loss elif name == 'mse_loss': self.loss_func = F.mse_loss elif name == 'smooth_l1_loss': self.loss_func = F.smooth_l1_loss else: raise NotImplementedError() def forward(self, pred, target, log_snr, p2_loss_weight_gamma): """ Args: pred(Tensor): The logits of prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. target(Tensor): The labels of the prediction, default is noise. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ losses = self.loss_func(pred, target, reduction="none") losses = reduce(losses, 'b ... 
-> b', 'mean') # p2 loss reweighting if p2_loss_weight_gamma > 0: loss_weight = ( self.p2_loss_weight_k + log_snr.exp())**-p2_loss_weight_gamma losses = losses * loss_weight return losses.mean() class ImagenModel(nn.Layer): def __init__( self, unets, image_sizes, text_encoder_name=None, text_embed_dim=1024, channels=3, timesteps=1000, cond_drop_prob=0.1, noise_schedules='cosine', pred_objectives='noise', random_crop_sizes=None, lowres_noise_schedule='linear', lowres_sample_noise_level=0.2, per_sample_random_aug_noise_level=False, condition_on_text=True, auto_normalize_img=True, p2_loss_weight_gamma=0.5, dynamic_thresholding=True, dynamic_thresholding_percentile=0.95, only_train_unet_number=None, is_sr=False, is_video=False, fused_linear=False, ): super().__init__() # conditioning hparams self.condition_on_text = condition_on_text self.unconditional = not condition_on_text self.is_sr = is_sr self.is_video = is_video # channels self.channels = channels # automatically take care of ensuring that first unet is unconditional # while the rest of the unets are conditioned on the low resolution image produced by previous unet unets = cast_tuple(unets) num_unets = len(unets) # determine noise schedules per unet timesteps = cast_tuple(timesteps, num_unets) # make sure noise schedule defaults to 'cosine', 'cosine', and then 'linear' for rest of super-resoluting unets noise_schedules = cast_tuple(noise_schedules) noise_schedules = pad_tuple_to_length(noise_schedules, 2, 'cosine') noise_schedules = pad_tuple_to_length(noise_schedules, num_unets, 'linear') # construct noise schedulers noise_scheduler_klass = GaussianDiffusionContinuousTimes self.noise_schedulers = nn.LayerList([]) for timestep, noise_schedule in zip(timesteps, noise_schedules): noise_scheduler = noise_scheduler_klass( noise_schedule=noise_schedule, timesteps=timestep) self.noise_schedulers.append(noise_scheduler) # randomly cropping for upsampler training self.random_crop_sizes = cast_tuple(random_crop_sizes, num_unets) assert not exists( first(self.random_crop_sizes) ), 'you should not need to randomly crop image during training for base unet, only for upsamplers - so pass in `random_crop_sizes = (None, 128, 256)` as example' # lowres augmentation noise schedule self.lowres_noise_schedule = GaussianDiffusionContinuousTimes( noise_schedule=lowres_noise_schedule) # ddpm objectives - predicting noise by default self.pred_objectives = cast_tuple(pred_objectives, num_unets) # get text encoder self.text_encoder_name = text_encoder_name if text_encoder_name is None: pass elif 't5' in text_encoder_name: self.text_embed_dim = default( text_embed_dim, lambda: get_encoded_dim(text_encoder_name)) self.t5_encoder = get_t5_model( name=text_encoder_name, pretrained=True) self.tokenizer = get_t5_tokenizer(name=text_encoder_name) self.t5_encode_text = t5_encode_text elif 'deberta' in text_encoder_name: self.text_embed_dim = default( text_embed_dim, lambda: get_debertav2_encoded_dim(text_encoder_name)) self.debertav2_encoder = get_debertav2_model( name=text_encoder_name, pretrained=True) self.tokenizer = get_debertav2_tokenizer(name=text_encoder_name) self.debertav2_encode_text = debertav2_encode_text else: raise NotImplementedError("Please implement the text encoder.") # construct unets self.unets = nn.LayerList([]) self.unet_being_trained_index = -1 # keeps track of which unet is being trained at the moment self.only_train_unet_number = only_train_unet_number for ind, one_unet in enumerate(unets): assert isinstance(one_unet, Unet) is_first = ind == 0 
one_unet = one_unet.cast_model_parameters( cond_on_text=self.condition_on_text, text_embed_dim=self.text_embed_dim if self.condition_on_text else None, channels=self.channels, channels_out=self.channels) self.unets.append(one_unet) # unet image sizes image_sizes = cast_tuple(image_sizes) self.image_sizes = image_sizes self.sample_channels = cast_tuple(self.channels, num_unets) self.right_pad_dims_to_datatype = partial( rearrange, pattern=('b -> b 1 1 1')) # cascading ddpm related stuff self.lowres_sample_noise_level = lowres_sample_noise_level self.per_sample_random_aug_noise_level = per_sample_random_aug_noise_level # classifier free guidance self.cond_drop_prob = cond_drop_prob self.can_classifier_guidance = cond_drop_prob > 0. # normalize and unnormalize image functions self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity self.input_image_range = (0. if auto_normalize_img else -1., 1.) # dynamic thresholding self.dynamic_thresholding = cast_tuple(dynamic_thresholding, num_unets) self.dynamic_thresholding_percentile = dynamic_thresholding_percentile # p2 loss weight self.p2_loss_weight_gamma = cast_tuple(p2_loss_weight_gamma, num_unets) assert all([ (gamma_value <= 2) for gamma_value in self.p2_loss_weight_gamma ]), 'in paper, they noticed any gamma greater than 2 is harmful' # one temp parameter for keeping track of device def get_unet(self, unet_number): assert 0 < unet_number <= len(self.unets) index = unet_number - 1 if isinstance(self.unets, nn.LayerList): unets_list = [unet for unet in self.unets] delattr(self, 'unets') self.unets = unets_list self.unet_being_trained_index = index return self.unets[index] def reset_unets(self, ): self.unets = nn.LayerList([*self.unets]) self.unet_being_trained_index = -1 @contextmanager def one_unet_in_gpu(self, unet_number=None, unet=None): assert exists(unet_number) ^ exists(unet) if exists(unet_number): unet = self.unets[unet_number - 1] yield def reset_unets_all(self, ): self.unets = nn.LayerList([*self.unets]) self.unet_being_trained_index = -1 # overriding state dict functions def state_dict(self, *args, **kwargs): self.reset_unets() return super().state_dict(*args, **kwargs) def load_state_dict(self, *args, **kwargs): self.reset_unets_all() return self.unets[self.unet_being_trained_index].set_state_dict( *args, **kwargs) # gaussian diffusion methods def p_mean_variance(self, unet, x, t, *, noise_scheduler, text_embeds=None, text_mask=None, cond_images=None, lowres_cond_img=None, self_cond=None, lowres_noise_times=None, cond_scale=1., model_output=None, t_next=None, pred_objective='noise', dynamic_threshold=True): assert not ( cond_scale != 1. 
and not self.can_classifier_guidance ), 'imagen was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)' time_var = noise_scheduler.get_condition(t) pred = default(model_output, lambda: unet.forward_with_cond_scale(x, time_var, text_embeds = text_embeds, text_mask = text_mask, cond_images = cond_images, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, lowres_noise_times = self.lowres_noise_schedule.get_condition(lowres_noise_times))) if pred_objective == 'noise': x_start = noise_scheduler.predict_start_from_noise( x, t=t, noise=pred) elif pred_objective == 'x_start': x_start = pred elif pred_objective == 'v': x_start = noise_scheduler.predict_start_from_v(x, t=t, v=pred) else: raise ValueError(f'unknown objective {pred_objective}') if dynamic_threshold: # following pseudocode in appendix # s is the dynamic threshold, determined by percentile of absolute values of reconstructed sample per batch element s = paddle.quantile( rearrange(x_start, 'b ... -> b (...)').abs(), self.dynamic_thresholding_percentile, axis=-1) s.clip_(min=1.) s = right_pad_dims_to(x_start, s) x_start = x_start.clip(-s, s) / s else: x_start.clip_(-1., 1.) mean_and_variance = noise_scheduler.q_posterior( x_start=x_start, x_t=x, t=t, t_next=t_next) return mean_and_variance, x_start @paddle.no_grad() def p_sample(self, unet, x, t, *, noise_scheduler, t_next=None, text_embeds=None, text_mask=None, cond_images=None, cond_scale=1., self_cond=None, lowres_cond_img=None, lowres_noise_times=None, pred_objective='noise', dynamic_threshold=True): b = x.shape[0] (model_mean, _, model_log_variance), x_start = self.p_mean_variance( unet, x=x, t=t, t_next=t_next, noise_scheduler=noise_scheduler, text_embeds=text_embeds, text_mask=text_mask, cond_images=cond_images, cond_scale=cond_scale, lowres_cond_img=lowres_cond_img, self_cond=self_cond, lowres_noise_times=lowres_noise_times, pred_objective=pred_objective, dynamic_threshold=dynamic_threshold) noise = paddle.randn(shape=x.shape, dtype=x.dtype) # no noise when t == 0 is_last_sampling_timestep = (t_next == 0) if isinstance( noise_scheduler, GaussianDiffusionContinuousTimes) else (t == 0) nonzero_mask = (1 - is_last_sampling_timestep.cast('float32')).reshape( [b, *((1, ) * (len(x.shape) - 1))]) pred = model_mean + nonzero_mask * (0.5 * model_log_variance ).exp() * noise return pred, x_start @paddle.no_grad() def p_sample_loop(self, unet, shape, *, noise_scheduler, lowres_cond_img=None, lowres_noise_times=None, text_embeds=None, text_mask=None, cond_images=None, inpaint_images=None, inpaint_masks=None, inpaint_resample_times=5, init_images=None, skip_steps=None, cond_scale=1, pred_objective='noise', dynamic_threshold=True): batch = shape[0] img = paddle.randn(shape) # for initialization with an image or video if exists(init_images): img += init_images # keep track of x0, for self conditioning x_start = None # prepare inpainting has_inpainting = exists(inpaint_images) and exists(inpaint_masks) resample_times = inpaint_resample_times if has_inpainting else 1 if has_inpainting: inpaint_images = self.normalize_img(inpaint_images) inpaint_images = resize_image_to(inpaint_images, shape[-1]) inpaint_masks = resize_image_to( rearrange(inpaint_masks, 'b ... 
-> b 1 ...').cast('float32'), shape[-1]).cast('bool') # time timesteps = noise_scheduler.get_sampling_timesteps(batch) # whether to skip any steps skip_steps = default(skip_steps, 0) timesteps = timesteps[skip_steps:] for times, times_next in tqdm( timesteps, desc='sampling loop time step', total=len(timesteps)): is_last_timestep = times_next == 0 for r in reversed(range(resample_times)): is_last_resample_step = r == 0 if has_inpainting: noised_inpaint_images, *_ = noise_scheduler.q_sample( inpaint_images, t=times) img = img * ~inpaint_masks + noised_inpaint_images * inpaint_masks self_cond = x_start if unet.self_cond else None img, x_start = self.p_sample( unet, img, times, t_next=times_next, text_embeds=text_embeds, text_mask=text_mask, cond_images=cond_images, cond_scale=cond_scale, self_cond=self_cond, lowres_cond_img=lowres_cond_img, lowres_noise_times=lowres_noise_times, noise_scheduler=noise_scheduler, pred_objective=pred_objective, dynamic_threshold=dynamic_threshold) if has_inpainting and not (is_last_resample_step or paddle.all(is_last_timestep)): renoised_img = noise_scheduler.q_sample_from_to( img, times_next, times) img = paddle.where( self.right_pad_dims_to_datatype(is_last_timestep), img, renoised_img) img.clip_(-1., 1.) # final inpainting if has_inpainting: img = img * ~inpaint_masks + inpaint_images * inpaint_masks unnormalize_img = self.unnormalize_img(img) return unnormalize_img @paddle.no_grad() @eval_decorator def sample( self, texts=None, text_masks=None, text_embeds=None, cond_images=None, inpaint_images=None, inpaint_masks=None, inpaint_resample_times=5, init_images=None, skip_steps=None, batch_size=1, cond_scale=1., lowres_sample_noise_level=None, start_at_unet_number=1, start_image_or_video=None, stop_at_unet_number=None, return_all_unet_outputs=True, return_pil_images=False, ): self.reset_unets() cond_images = maybe(cast_uint8_images_to_float)(cond_images) if exists(texts) and not exists( text_embeds) and not self.unconditional: with paddle.amp.auto_cast(enable=False): if 't5' in self.text_encoder_name: text_embeds, text_masks = self.t5_encode_text( t5=self.t5_encoder, texts=texts, return_attn_mask=True) elif 'debert' in self.text_encoder_name: text_embeds, text_masks = self.debertav2_encode_text( debertav2=self.debertav2_encoder, texts=texts, return_attn_mask=True) if not self.unconditional: text_masks = default( text_masks, lambda: paddle.any(text_embeds != 0., axis=-1)) batch_size = text_embeds.shape[0] if exists(inpaint_images): if self.unconditional: if batch_size == 1: # assume researcher wants to broadcast along inpainted images batch_size = inpaint_images.shape[0] assert inpaint_images.shape[ 0] == batch_size, 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=)``' assert not ( self.condition_on_text and inpaint_images.shape[0] != text_embeds.shape[0] ), 'number of inpainting images must be equal to the number of text to be conditioned on' assert not ( self.condition_on_text and not exists(text_embeds) ), 'text or text encodings must be passed into imagen if specified' assert not ( not self.condition_on_text and exists(text_embeds) ), 'imagen specified not to be conditioned on text, yet it is presented' assert not ( exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim ), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})' assert not ( exists(inpaint_images) ^ exists(inpaint_masks) ), 'inpaint images and masks must be both passed in to do inpainting' outputs 
= [] lowres_sample_noise_level = default(lowres_sample_noise_level, self.lowres_sample_noise_level) num_unets = len(self.unets) # condition scaling cond_scale = cast_tuple(cond_scale, num_unets) # for initial image and skipping steps init_images = cast_tuple(init_images, num_unets) init_images = [ maybe(self.normalize_img)(init_image) for init_image in init_images ] skip_steps = cast_tuple(skip_steps, num_unets) # handle starting at a unet greater than 1, for training only-upscaler training if start_at_unet_number > 1: assert not exists(stop_at_unet_number ) or start_at_unet_number <= stop_at_unet_number assert exists( start_image_or_video ), 'starting image or video must be supplied if only doing upscaling' prev_image_size = self.image_sizes[start_at_unet_number - 1] img = resize_image_to(start_image_or_video, prev_image_size) # go through each unet in cascade for unet_number, unet, channel, image_size, noise_scheduler, pred_objective, dynamic_threshold, unet_cond_scale, unet_init_images, unet_skip_steps in tqdm( zip( range(1, num_unets + 1), self.unets, self.sample_channels, self.image_sizes, self.noise_schedulers, self.pred_objectives, self.dynamic_thresholding, cond_scale, init_images, skip_steps)): lowres_cond_img = lowres_noise_times = None shape = (batch_size, channel, image_size, image_size) if unet.lowres_cond: lowres_noise_times = self.lowres_noise_schedule.get_times( batch_size, lowres_sample_noise_level) lowres_cond_img = resize_image_to(img, image_size) lowres_cond_img = self.normalize_img(lowres_cond_img) lowres_cond_img, *_ = self.lowres_noise_schedule.q_sample( x_start=lowres_cond_img, t=lowres_noise_times, noise=paddle.randn( shape=lowres_cond_img.shape, dtype=lowres_cond_img.dtype)) if exists(unet_init_images): unet_init_images = resize_image_to(unet_init_images, image_size) shape = (batch_size, self.channels, image_size, image_size) img = self.p_sample_loop( unet, shape, text_embeds=text_embeds, text_mask=text_masks, cond_images=cond_images, inpaint_images=inpaint_images, inpaint_masks=inpaint_masks, inpaint_resample_times=inpaint_resample_times, init_images=unet_init_images, skip_steps=unet_skip_steps, cond_scale=unet_cond_scale, lowres_cond_img=lowres_cond_img, lowres_noise_times=lowres_noise_times, noise_scheduler=noise_scheduler, pred_objective=pred_objective, dynamic_threshold=dynamic_threshold) outputs.append(img) if exists(stop_at_unet_number ) and stop_at_unet_number == unet_number: break output_index = -1 if not return_all_unet_outputs else slice( None) # either return last unet output or all unet outputs if not return_pil_images: return outputs[output_index] if not return_all_unet_outputs: outputs = outputs[-1:] pil_images = list( map(lambda img: list(map(T.ToPILImage(), img.unbind(dim=0))), outputs)) return pil_images[ output_index] # now you have a bunch of pillow images you can just .save(/where/ever/you/want.png) def p_losses(self, unet, x_start, times, *, noise_scheduler, lowres_cond_img=None, lowres_aug_times=None, text_embeds=None, text_mask=None, cond_images=None, noise=None, times_next=None, pred_objective='noise', p2_loss_weight_gamma=0., random_crop_size=None): is_video = x_start.ndim == 5 noise = default(noise, lambda: paddle.randn(shape=x_start.shape, dtype=x_start.dtype)) # normalize to [-1, 1] x_start = self.normalize_img(x_start) lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img) # random cropping during training # for upsamplers if exists(random_crop_size): if is_video: frames = x_start.shape[2] x_start, lowres_cond_img, noise = 
rearrange_many( (x_start, lowres_cond_img, noise), 'b c f h w -> (b f) c h w') aug = K.RandomCrop((random_crop_size, random_crop_size), p=1.) # make sure low res conditioner and image both get augmented the same way # detailed https://kornia.readthedocs.io/en/latest/augmentation.module.html?highlight=randomcrop#kornia.augmentation.RandomCrop x_start = aug(x_start) lowres_cond_img = aug(lowres_cond_img, params=aug._params) noise = aug(noise, params=aug._params) if is_video: x_start, lowres_cond_img, noise = rearrange_many( (x_start, lowres_cond_img, noise), '(b f) c h w -> b c f h w', f=frames) # get x_t x_noisy, log_snr, alpha, sigma = noise_scheduler.q_sample( x_start=x_start, t=times, noise=noise) # also noise the lowres conditioning image # at sample time, they then fix the noise level of 0.1 - 0.3 lowres_cond_img_noisy = None if exists(lowres_cond_img): lowres_aug_times = default(lowres_aug_times, times) lowres_cond_img_noisy, *_ = self.lowres_noise_schedule.q_sample( x_start=lowres_cond_img, t=lowres_aug_times, noise=paddle.randn( shape=lowres_cond_img.shape, dtype=lowres_cond_img.dtype)) # time condition noise_cond = noise_scheduler.get_condition(times) # unet kwargs unet_kwargs = dict( text_embeds=text_embeds, text_mask=text_mask, cond_images=cond_images, lowres_noise_times=self.lowres_noise_schedule.get_condition( lowres_aug_times), lowres_cond_img=lowres_cond_img_noisy, cond_drop_prob=self.cond_drop_prob, ) # self condition if needed # Because 'unet' can be an instance of DistributedDataParallel coming from the # ImagenTrainer.unet_being_trained when invoking ImagenTrainer.forward(), we need to # access the member 'module' of the wrapped unet instance. self_cond = unet._layers.self_cond if isinstance( unet, paddle.DataParallel) else unet.self_cond if self_cond and random() < 0.5: with paddle.no_grad(): pred = unet.forward(x_noisy, noise_cond, **unet_kwargs).detach() x_start = noise_scheduler.predict_start_from_noise( x_noisy, t=times, noise=pred) if pred_objective == 'noise' else pred unet_kwargs = { ** unet_kwargs, 'self_cond': x_start} # get prediction pred = unet.forward(x_noisy, noise_cond, **unet_kwargs) # prediction objective if pred_objective == 'noise': target = noise elif pred_objective == 'x_start': target = x_start elif pred_objective == 'v': # derivation detailed in Appendix D of Progressive Distillation paper # https://arxiv.org/abs/2202.00512 # this makes distillation viable as well as solve an issue with color shifting in upresoluting unets, noted in imagen-video target = alpha * noise - sigma * x_start else: raise ValueError(f'unknown objective {pred_objective}') return pred, target, log_snr, p2_loss_weight_gamma def forward(self, images, unet=None, texts=None, text_embeds=None, text_masks=None, unet_number=None, cond_images=None): if self.is_video and images.ndim == 4: images = rearrange(images, 'b c h w -> b c 1 h w') assert images.shape[-1] == images.shape[ -2], f'the images you pass in must be a square, but received dimensions of {images.shape[2]}, {images.shape[-1]}' assert not ( len(self.unets) > 1 and not exists(unet_number) ), f'you must specify which unet you want trained, from a range of 1 to {len(self.unets)}, if you are training cascading DDPM (multiple unets)' unet_number = default(unet_number, 1) assert not exists( self.only_train_unet_number ) or self.only_train_unet_number == unet_number, 'you can only train on unet #{self.only_train_unet_number}' images = cast_uint8_images_to_float(images) cond_images = 
maybe(cast_uint8_images_to_float)(cond_images) assert is_float_dtype( images.dtype ), f'images tensor needs to be floats but {images.dtype} dtype found instead' unet_index = unet_number - 1 unet = default(unet, lambda: self.get_unet(unet_number)) noise_scheduler = self.noise_schedulers[unet_index] p2_loss_weight_gamma = self.p2_loss_weight_gamma[unet_index] pred_objective = self.pred_objectives[unet_index] target_image_size = self.image_sizes[unet_index] random_crop_size = self.random_crop_sizes[unet_index] if self.is_sr: prev_image_size = self.image_sizes[unet_index - 1] else: prev_image_size = None b, c, h, w = images.shape assert images.shape[1] == self.channels assert h >= target_image_size and w >= target_image_size times = noise_scheduler.sample_random_times(b) if exists(texts) and not exists( text_embeds) and not self.unconditional: assert len(texts) == len( images ), 'number of text captions does not match up with the number of images given' with paddle.amp.auto_cast(enable=False): if 't5' in self.text_encoder_name: text_embeds, text_masks = self.t5_encode_text( t5=self.t5_encoder, texts=texts, tokenizer=self.tokenizer, return_attn_mask=True) elif 'deberta' in self.text_encoder_name: text_embeds, text_masks = self.debertav2_encode_text( debertav2=self.debertav2_encoder, texts=texts, tokenizer=self.tokenizer, return_attn_mask=True) else: raise NotImplementedError( "Please implement the text encoder.") if not self.unconditional: text_masks = default( text_masks, lambda: paddle.any(text_embeds != 0., axis=-1)) assert not ( self.condition_on_text and not exists(text_embeds) ), 'text or text encodings must be passed into decoder if specified' assert not ( not self.condition_on_text and exists(text_embeds) ), 'decoder specified not to be conditioned on text, yet it is presented' assert not ( exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim ), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})' lowres_cond_img = lowres_aug_times = None if exists(prev_image_size): lowres_cond_img = resize_image_to(images, prev_image_size) lowres_cond_img = resize_image_to(lowres_cond_img, target_image_size) if self.per_sample_random_aug_noise_level: lowres_aug_times = self.lowres_noise_schedule.sample_random_times( b) else: lowres_aug_time = self.lowres_noise_schedule.sample_random_times( 1) lowres_aug_times = repeat(lowres_aug_time, '1 -> b', b=b) images = resize_image_to(images, target_image_size) return self.p_losses( unet, images, times, text_embeds=text_embeds, text_mask=text_masks, cond_images=cond_images, noise_scheduler=noise_scheduler, lowres_cond_img=lowres_cond_img, lowres_aug_times=lowres_aug_times, pred_objective=pred_objective, p2_loss_weight_gamma=p2_loss_weight_gamma, random_crop_size=random_crop_size) def imagen_397M_text2im_64(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') model = ImagenModel( unets=Unet64_397M(use_recompute=use_recompute), image_sizes=(64, ), **kwargs) return model def imagen_text2im_64(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') if 'lowres_cond' in kwargs: lowres_cond = kwargs.pop('lowres_cond') else: lowres_cond = False model = ImagenModel( unets=BaseUnet64( lowres_cond=lowres_cond, use_recompute=use_recompute), image_sizes=(64, ), **kwargs) return model def imagen_text2im_64_debertav2(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = 
kwargs.pop('recompute_granularity') model = ImagenModel( unets=BaseUnet64( dim=360, use_recompute=use_recompute), image_sizes=(64, ), **kwargs) return model def imagen_text2im_64_SR256(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') model = ImagenModel( unets=(BaseUnet64(use_recompute=use_recompute), SRUnet256(use_recompute=use_recompute)), image_sizes=(64, 256), **kwargs) return model def imagen_SR256(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') if 'lowres_cond' in kwargs: lowres_cond = kwargs.pop('lowres_cond') else: lowres_cond = False model = ImagenModel( unets=SRUnet256( lowres_cond=lowres_cond, use_recompute=use_recompute), image_sizes=(256, 64), **kwargs) return model def imagen_SR1024(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') if 'lowres_cond' in kwargs: lowres_cond = kwargs.pop('lowres_cond') else: lowres_cond = False model = ImagenModel( unets=SRUnet1024( dim=128, lowres_cond=lowres_cond, use_recompute=use_recompute), image_sizes=(1024, 256), **kwargs) return model ================================================ FILE: ppfleetx/models/multimodal_model/imagen/unet.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
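(An illustrative note on the classifier-free guidance used by the Unet defined in this file: during training, text conditioning is dropped with probability cond_drop_prob, and at sampling time forward_with_cond_scale blends the conditional and unconditional predictions. The sketch below uses toy random tensors and a hypothetical helper name purely to show the blending arithmetic.)

import paddle

def guidance_blend(cond_pred, uncond_pred, cond_scale=3.0):
    # mirrors the blend in Unet.forward_with_cond_scale:
    # cond_scale > 1 pushes the output further along the text-conditioned direction
    return uncond_pred + (cond_pred - uncond_pred) * cond_scale

# toy stand-ins for the conditional and unconditional Unet outputs
cond = paddle.randn([1, 3, 64, 64])
uncond = paddle.randn([1, 3, 64, 64])
guided = guidance_blend(cond, uncond, cond_scale=3.0)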
import math from pathlib import Path from functools import partial import paddle from paddle import nn from paddle import nn, einsum import paddle.nn.functional as F from paddle.distributed.fleet.utils import recompute from .utils import (zeros_, zero_init_, default, exists, cast_tuple, l2norm, resize_image_to, prob_mask_like, masked_mean, Identity, repeat, repeat_many, Rearrange, rearrange, rearrange_many, EinopsToAndFrom, Parallel, Always, print_once) from ppfleetx.models.language_model.t5.modeling import finfo class LayerNorm(nn.Layer): def __init__(self, feats, stable=False, dim=-1): super().__init__() self.stable = stable self.dim = dim self.g = self.create_parameter( [feats, *((1, ) * (-dim - 1))], default_initializer=nn.initializer.Constant(value=1.)) def forward(self, x): dtype, dim = x.dtype, self.dim if self.stable: x = x / x.amax(axis=dim, keepdim=True).detach() eps = 1e-5 if x.dtype == paddle.float32 else 1e-3 var = paddle.var(x, axis=dim, unbiased=False, keepdim=True) mean = paddle.mean(x, axis=dim, keepdim=True) return (x - mean) * ( var + eps).rsqrt().cast(dtype) * self.g.cast(dtype) ChanLayerNorm = partial(LayerNorm, dim=-3) class Residual(nn.Layer): def __init__(self, fn): super().__init__() self.fn = fn def forward(self, x, **kwargs): return self.fn(x, **kwargs) + x # attention pooling class PerceiverAttention(nn.Layer): def __init__(self, *, dim, dim_head=64, heads=8, cosine_sim_attn=False): super().__init__() self.scale = dim_head**-0.5 if not cosine_sim_attn else 1 self.cosine_sim_attn = cosine_sim_attn self.cosine_sim_scale = 16 if cosine_sim_attn else 1 self.heads = heads inner_dim = dim_head * heads self.norm = nn.LayerNorm(dim) self.norm_latents = nn.LayerNorm(dim) self.to_q = nn.Linear(dim, inner_dim, bias_attr=False) self.to_kv = nn.Linear(dim, inner_dim * 2, bias_attr=False) self.to_out = nn.Sequential( nn.Linear( inner_dim, dim, bias_attr=False), nn.LayerNorm(dim)) def forward(self, x, latents, mask=None): x = self.norm(x) latents = self.norm_latents(latents) b, h = x.shape[0], self.heads q = self.to_q(latents) # the paper differs from Perceiver in which they also concat the key / values derived from the latents to be attended to kv_input = paddle.concat((x, latents), axis=-2) k, v = self.to_kv(kv_input).chunk(2, axis=-1) q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h=h) q = q * self.scale # cosine sim attention if self.cosine_sim_attn: q, k = map(l2norm, (q, k)) # similarities and masking sim = einsum('... i d, ... j d -> ... i j', q, k) * self.cosine_sim_scale if exists(mask): max_neg_value = -finfo(sim.dtype).max mask = F.pad(mask, (0, latents.shape[-2]), value=True) mask = rearrange(mask, 'b j -> b 1 1 j') sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim) # attention attn = F.softmax(sim, axis=-1, dtype=paddle.float32) attn = attn.cast(sim.dtype) out = einsum('... i j, ... j d -> ... 
i d', attn, v) B, H, N, D = out.shape out = out.transpose([0, 2, 1, 3]).reshape([B, N, -1]) return self.to_out(out) class PerceiverResampler(nn.Layer): def __init__( self, *, dim, depth, dim_head=64, heads=8, num_latents=64, num_latents_mean_pooled=4, # number of latents derived from mean pooled representation of the sequence max_seq_len=512, ff_mult=4, cosine_sim_attn=False): super().__init__() self.pos_emb = nn.Embedding(max_seq_len, dim) self.latents = self.create_parameter( [num_latents, dim], default_initializer=nn.initializer.Normal()) self.to_latents_from_mean_pooled_seq = None if num_latents_mean_pooled > 0: self.to_latents_from_mean_pooled_seq = nn.Sequential( LayerNorm(dim), nn.Linear(dim, dim * num_latents_mean_pooled), Rearrange( 'b (n d) -> b n d', n=num_latents_mean_pooled)) self.layers = nn.LayerList([]) for _ in range(depth): self.layers.append( nn.LayerList([ PerceiverAttention( dim=dim, dim_head=dim_head, heads=heads, cosine_sim_attn=cosine_sim_attn), FeedForward( dim=dim, mult=ff_mult) ])) def forward(self, x, mask=None): n = x.shape[1] pos_emb = self.pos_emb(paddle.arange(n)) x_with_pos = x + pos_emb latents = repeat(self.latents, 'n d -> b n d', b=x.shape[0]) if exists(self.to_latents_from_mean_pooled_seq): meanpooled_seq = masked_mean( x, axis=1, mask=paddle.ones( x.shape[:2], dtype=paddle.bool)) meanpooled_latents = self.to_latents_from_mean_pooled_seq( meanpooled_seq) latents = paddle.concat((meanpooled_latents, latents), axis=-2) for attn, ff in self.layers: latents = attn(x_with_pos, latents, mask=mask) + latents latents = ff(latents) + latents return latents # attention class Attention(nn.Layer): def __init__( self, dim, *, dim_head=64, heads=8, context_dim=None, cosine_sim_attn=False, use_recompute=False, ): super().__init__() self.use_recompute = use_recompute self.scale = dim_head**-0.5 if not cosine_sim_attn else 1. 
self.cosine_sim_attn = cosine_sim_attn self.cosine_sim_scale = 16 if cosine_sim_attn else 1 self.heads = heads inner_dim = dim_head * heads self.norm = LayerNorm(dim) self.null_kv = self.create_parameter( [2, dim_head], default_initializer=nn.initializer.Normal()) self.to_q = nn.Linear(dim, inner_dim, bias_attr=False) self.to_kv = nn.Linear(dim, dim_head * 2, bias_attr=False) self.to_context = nn.Sequential( nn.LayerNorm(context_dim), nn.Linear( context_dim, dim_head * 2)) if exists(context_dim) else None self.to_out = nn.Sequential( nn.Linear( inner_dim, dim, bias_attr=False), LayerNorm(dim)) def forward(self, x, context=None, mask=None, attn_bias=None): if self.use_recompute: return recompute(self._forward, x, context, mask, attn_bias) else: return self._forward(x, context, mask, attn_bias) def _forward(self, x, context=None, mask=None, attn_bias=None): b, n = x.shape[:2] x = self.norm(x) q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, axis=-1)) q = rearrange(q, 'b n (h d) -> b h n d', h=self.heads) q = q * self.scale # add null key / value for classifier free guidance in prior net nk, nv = repeat_many(self.null_kv.unbind(axis=-2), 'd -> b 1 d', b=b) k = paddle.concat((nk, k), axis=-2) v = paddle.concat((nv, v), axis=-2) # add text conditioning, if present if exists(context): assert exists(self.to_context) ck, cv = self.to_context(context).chunk(2, axis=-1) k = paddle.concat((ck, k), axis=-2) v = paddle.concat((cv, v), axis=-2) # cosine sim attention if self.cosine_sim_attn: q, k = map(l2norm, (q, k)) # calculate query / key similarities sim = einsum('b h i d, b j d -> b h i j', q, k) * self.cosine_sim_scale # relative positional encoding (T5 style) if exists(attn_bias): sim = sim + attn_bias # masking max_neg_value = -finfo(sim.dtype).max if exists(mask): mask = F.pad(mask, (1, 0), value=True) mask = rearrange(mask, 'b j -> b 1 1 j') sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim) # attention attn = F.softmax(sim, axis=-1, dtype=paddle.float32) # aggregate values out = einsum('b h i j, b j d -> b h i d', attn, v) out = rearrange(out, 'b h n d -> b n (h d)') return self.to_out(out) # decoder def Upsample(dim, dim_out=None): dim_out = default(dim_out, dim) return nn.Sequential( nn.Upsample( scale_factor=2, mode='nearest'), nn.Conv2D( dim, dim_out, 3, padding=1)) class PixelShuffleUpsample(nn.Layer): """ code shared by @MalumaDev at DALLE2 for addressing checkboard artifacts https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf """ def __init__(self, dim, dim_out=None): super().__init__() dim_out = default(dim_out, dim) conv = nn.Conv2D(dim, dim_out * 4, 1) self.net = nn.Sequential(conv, nn.Silu(), nn.PixelShuffle(2)) self.init_conv_(conv) def init_conv_(self, conv): o, i, h, w = conv.weight.shape conv_weight = paddle.empty([o // 4, i, h, w]) nn.initializer.KaimingUniform(conv_weight) conv_weight = repeat(conv_weight, 'o ... 
-> (o 4) ...') conv.weight.set_value(conv_weight) zeros_(conv.bias) def forward(self, x): return self.net(x) def Downsample(dim, dim_out=None): dim_out = default(dim_out, dim) return nn.Sequential( Rearrange( 'b c (h s1) (w s2) -> b (c s1 s2) h w', s1=2, s2=2), nn.Conv2D(dim * 4, dim_out, 1)) class SinusoidalPosEmb(nn.Layer): def __init__(self, dim): super().__init__() self.dim = dim def forward(self, x): half_dim = self.dim // 2 emb = math.log(10000) / (half_dim - 1) emb = paddle.exp(paddle.arange(half_dim) * -emb) emb = x[:, None] * emb[None, :] return paddle.concat((emb.sin(), emb.cos()), axis=-1) class LearnedSinusoidalPosEmb(nn.Layer): """ following @crowsonkb 's lead with learned sinusoidal pos emb """ """ https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8 """ def __init__(self, dim): super().__init__() assert (dim % 2) == 0 half_dim = dim // 2 self.weights = self.create_parameter( [half_dim], default_initializer=nn.initializer.Normal()) def forward(self, x): x = x[:, None] freqs = x * self.weights[None, :] * 2 * math.pi fouriered = paddle.concat((freqs.sin(), freqs.cos()), axis=-1) fouriered = paddle.concat((x, fouriered), axis=-1) return fouriered class Block(nn.Layer): def __init__(self, dim, dim_out, groups=8, norm=True): super().__init__() self.groupnorm = nn.GroupNorm(groups, dim) if norm else Identity() self.activation = nn.Silu() self.project = nn.Conv2D(dim, dim_out, 3, padding=1) def forward(self, x, scale_shift=None): x = self.groupnorm(x) if exists(scale_shift): scale, shift = scale_shift x = x * (scale + 1) + shift x = self.activation(x) return self.project(x) class ResnetBlock(nn.Layer): def __init__(self, dim, dim_out, *, cond_dim=None, time_cond_dim=None, groups=8, linear_attn=False, use_gca=False, squeeze_excite=False, use_recompute=False, **attn_kwargs): super().__init__() self.time_mlp = None self.use_recompute = use_recompute if exists(time_cond_dim): self.time_mlp = nn.Sequential( nn.Silu(), nn.Linear(time_cond_dim, dim_out * 2)) self.cross_attn = None if exists(cond_dim): attn_klass = CrossAttention if not linear_attn else LinearCrossAttention self.cross_attn = attn_klass( dim=dim_out, context_dim=cond_dim, **attn_kwargs) self.block1 = Block(dim, dim_out, groups=groups) self.block2 = Block(dim_out, dim_out, groups=groups) self.gca = GlobalContext( dim_in=dim_out, dim_out=dim_out) if use_gca else Always(1) self.res_conv = nn.Conv2D(dim, dim_out, 1) if dim != dim_out else Identity() def forward(self, x, time_emb=None, cond=None): scale_shift = None if exists(self.time_mlp) and exists(time_emb): time_emb = self.time_mlp(time_emb) time_emb = time_emb[:, :, None, None] scale_shift = time_emb.chunk(2, axis=1) h = self.block1(x) if exists(self.cross_attn): assert exists(cond) h = h.transpose([0, 2, 3, 1]) n, b, c, *_ = h.shape h = h.reshape([n, b * c, -1]) h = self.cross_attn(h, context=cond) + h h = h.reshape([n, b, c, -1]) h = h.transpose([0, 3, 1, 2]) h = self.block2(h, scale_shift=scale_shift) h = h * self.gca(h) return h + self.res_conv(x) class CrossAttention(nn.Layer): def __init__(self, dim, *, context_dim=None, dim_head=64, heads=8, norm_context=False, cosine_sim_attn=False): super().__init__() self.scale = dim_head**-0.5 if not cosine_sim_attn else 1. 
self.cosine_sim_attn = cosine_sim_attn self.cosine_sim_scale = 16 if cosine_sim_attn else 1 self.heads = heads inner_dim = dim_head * heads context_dim = default(context_dim, dim) self.norm = LayerNorm(dim) self.norm_context = LayerNorm( context_dim) if norm_context else Identity() self.null_kv = self.create_parameter( [2, dim_head], default_initializer=nn.initializer.Normal()) self.to_q = nn.Linear(dim, inner_dim, bias_attr=False) self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias_attr=False) self.to_out = nn.Sequential( nn.Linear( inner_dim, dim, bias_attr=False), LayerNorm(dim)) def forward(self, x, context, mask=None): b, n = x.shape[:2] x = self.norm(x) context = self.norm_context(context) q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, axis=-1)) q, k, v = rearrange_many( (q, k, v), 'b n (h d) -> b h n d', h=self.heads) # add null key / value for classifier free guidance in prior net nk, nv = repeat_many( self.null_kv.unbind(axis=-2), 'd -> b h 1 d', h=self.heads, b=b) k = paddle.concat((nk, k), axis=-2) v = paddle.concat((nv, v), axis=-2) q = q * self.scale # cosine sim attention if self.cosine_sim_attn: q, k = map(l2norm, (q, k)) # similarities sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.cosine_sim_scale # masking max_neg_value = -finfo(sim.dtype).max if exists(mask): mask = F.pad(mask, (1, 0), value=True) mask = rearrange(mask, 'b j -> b 1 1 j') sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim) attn = F.softmax(sim, axis=-1, dtype=paddle.float32) attn = attn.cast(sim.dtype) out = einsum('b h i j, b h j d -> b h i d', attn, v) out = rearrange(out, 'b h n d -> b n (h d)') return self.to_out(out) class LinearCrossAttention(CrossAttention): def forward(self, x, context, mask=None): b, n = x.shape[:2] x = self.norm(x) context = self.norm_context(context) q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, axis=-1)) q, k, v = rearrange_many( (q, k, v), 'b n (h d) -> (b h) n d', h=self.heads) # add null key / value for classifier free guidance in prior net nk, nv = repeat_many( self.null_kv.unbind(axis=-2), 'd -> (b h) 1 d', h=self.heads, b=b) k = paddle.concat((nk, k), axis=-2) v = paddle.concat((nv, v), axis=-2) # masking max_neg_value = -finfo(x.dtype).max if exists(mask): mask = F.pad(mask, (1, 0), value=True) mask = rearrange(mask, 'b n -> b n 1') k = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), k) v = paddle.where(mask == 0, paddle.to_tensor(0.), v) # linear attention q = F.softmax(q, axis=-1) k = F.softmax(k, axis=-2) q = q * self.scale context = einsum('b n d, b n e -> b d e', k, v) out = einsum('b n d, b d e -> b n e', q, context) out = rearrange(out, '(b h) n d -> b n (h d)', h=self.heads) return self.to_out(out) class LinearAttention(nn.Layer): def __init__(self, dim, dim_head=32, heads=8, dropout=0.05, context_dim=None, **kwargs): super().__init__() self.scale = dim_head**-0.5 self.heads = heads inner_dim = dim_head * heads self.norm = ChanLayerNorm(dim) self.nonlin = nn.Silu() self.to_q = nn.Sequential( nn.Dropout(dropout), nn.Conv2D( dim, inner_dim, 1, bias_attr=False), nn.Conv2D( inner_dim, inner_dim, 3, bias_attr=False, padding=1, groups=inner_dim)) self.to_k = nn.Sequential( nn.Dropout(dropout), nn.Conv2D( dim, inner_dim, 1, bias_attr=False), nn.Conv2D( inner_dim, inner_dim, 3, bias_attr=False, padding=1, groups=inner_dim)) self.to_v = nn.Sequential( nn.Dropout(dropout), nn.Conv2D( dim, inner_dim, 1, bias_attr=False), nn.Conv2D( inner_dim, inner_dim, 3, bias_attr=False, padding=1, groups=inner_dim)) self.to_context 
= nn.Sequential( nn.LayerNorm(context_dim), nn.Linear( context_dim, inner_dim * 2, bias_attr=False)) if exists(context_dim) else None self.to_out = nn.Sequential( nn.Conv2D( inner_dim, dim, 1, bias_attr=False), ChanLayerNorm(dim)) def forward(self, fmap, context=None): h, x, y = self.heads, *fmap.shape[-2:] fmap = self.norm(fmap) q, k, v = map(lambda fn: fn(fmap), (self.to_q, self.to_k, self.to_v)) q, k, v = rearrange_many( (q, k, v), 'b (h c) x y -> (b h) (x y) c', h=h) if exists(context): assert exists(self.to_context) ck, cv = self.to_context(context).chunk(2, axis=-1) ck, cv = rearrange_many((ck, cv), 'b n (h d) -> (b h) n d', h=h) k = paddle.concat((k, ck), axis=-2) v = paddle.concat((v, cv), axis=-2) q = F.softmax(q, axis=-1) k = F.softmax(k, axis=-2) q = q * self.scale context = einsum('b n d, b n e -> b d e', k, v) out = einsum('b n d, b d e -> b n e', q, context) out = rearrange(out, '(b h) (x y) d -> b (h d) x y', h=h, x=x, y=y) out = self.nonlin(out) return self.to_out(out) class GlobalContext(nn.Layer): """ basically a superior form of squeeze-excitation that is attention-esque """ def __init__(self, *, dim_in, dim_out): super().__init__() self.to_k = nn.Conv2D(dim_in, 1, 1) hidden_dim = max(3, dim_out // 2) self.net = nn.Sequential( nn.Conv2D(dim_in, hidden_dim, 1), nn.Silu(), nn.Conv2D(hidden_dim, dim_out, 1), nn.Sigmoid()) def forward(self, x): context = self.to_k(x) x, context = rearrange_many((x, context), 'b n ... -> b n (...)') out = einsum('b i n, b c n -> b c i', F.softmax(context, axis=-1), x) out = out[:, :, :, None] return self.net(out) def FeedForward(dim, mult=2): hidden_dim = int(dim * mult) return nn.Sequential( LayerNorm(dim), nn.Linear( dim, hidden_dim, bias_attr=False), nn.GELU(), LayerNorm(hidden_dim), nn.Linear( hidden_dim, dim, bias_attr=False)) def ChanFeedForward( dim, mult=2 ): # in paper, it seems for self attention layers they did feedforwards with twice channel width hidden_dim = int(dim * mult) return nn.Sequential( ChanLayerNorm(dim), nn.Conv2D( dim, hidden_dim, 1, bias_attr=False), nn.GELU(), ChanLayerNorm(hidden_dim), nn.Conv2D( hidden_dim, dim, 1, bias_attr=False)) class TransformerBlock(nn.Layer): def __init__( self, dim, *, depth=1, heads=8, dim_head=32, ff_mult=2, context_dim=None, cosine_sim_attn=False, use_recompute=False, ): super().__init__() self.layers = nn.LayerList([]) for _ in range(depth): self.layers.append( nn.LayerList([ Attention( dim=dim, heads=heads, dim_head=dim_head, context_dim=context_dim, cosine_sim_attn=cosine_sim_attn, use_recompute=use_recompute), FeedForward( dim=dim, mult=ff_mult) ])) def forward(self, x, context=None): x = x.transpose([0, 2, 3, 1]) n, b, c, *_ = x.shape x = x.reshape([n, b * c, -1]) for attn, ff in self.layers: x = attn(x, context=context) + x x = ff(x) + x x = x.reshape([n, b, c, -1]) x = x.transpose([0, 3, 1, 2]) return x class LinearAttentionTransformerBlock(nn.Layer): def __init__(self, dim, *, depth=1, heads=8, dim_head=32, ff_mult=2, context_dim=None, **kwargs): super().__init__() self.layers = nn.LayerList([]) for _ in range(depth): self.layers.append( nn.LayerList([ LinearAttention( dim=dim, heads=heads, dim_head=dim_head, context_dim=context_dim), ChanFeedForward( dim=dim, mult=ff_mult) ])) def forward(self, x, context=None): for attn, ff in self.layers: x = attn(x, context=context) + x x = ff(x) + x return x class CrossEmbedLayer(nn.Layer): def __init__(self, dim_in, kernel_sizes, dim_out=None, stride=2): super().__init__() assert all([*map(lambda t: (t % 2) == (stride % 2), 
kernel_sizes)]) dim_out = default(dim_out, dim_in) kernel_sizes = sorted(kernel_sizes) num_scales = len(kernel_sizes) # calculate the dimension at each scale dim_scales = [int(dim_out / (2**i)) for i in range(1, num_scales)] dim_scales = [*dim_scales, dim_out - sum(dim_scales)] self.convs = nn.LayerList([]) for kernel, dim_scale in zip(kernel_sizes, dim_scales): self.convs.append( nn.Conv2D( dim_in, dim_scale, kernel, stride=stride, padding=(kernel - stride) // 2)) def forward(self, x): fmaps = tuple(map(lambda conv: conv(x), self.convs)) return paddle.concat(fmaps, axis=1) class UpsampleCombiner(nn.Layer): def __init__(self, dim, *, enabled=False, dim_ins=tuple(), dim_outs=tuple()): super().__init__() dim_outs = cast_tuple(dim_outs, len(dim_ins)) assert len(dim_ins) == len(dim_outs) self.enabled = enabled if not self.enabled: self.dim_out = dim return self.fmap_convs = nn.LayerList([ Block(dim_in, dim_out) for dim_in, dim_out in zip(dim_ins, dim_outs) ]) self.dim_out = dim + (sum(dim_outs) if len(dim_outs) > 0 else 0) def forward(self, x, fmaps=None): target_size = x.shape[-1] fmaps = default(fmaps, tuple()) if not self.enabled or len(fmaps) == 0 or len(self.fmap_convs) == 0: return x fmaps = [resize_image_to(fmap, target_size) for fmap in fmaps] outs = [conv(fmap) for fmap, conv in zip(fmaps, self.fmap_convs)] return paddle.concat((x, *outs), axis=1) class Unet(nn.Layer): def __init__(self, *, dim, image_embed_dim=1024, text_embed_dim=1024, num_resnet_blocks=1, cond_dim=None, num_image_tokens=4, num_time_tokens=2, learned_sinu_pos_emb_dim=16, out_dim=None, dim_mults=(1, 2, 4, 8), cond_images_channels=0, channels=3, channels_out=None, attn_dim_head=64, attn_heads=8, ff_mult=2., lowres_cond=False, layer_attns=True, layer_attns_depth=1, layer_mid_attns_depth=1, layer_attns_add_text_cond=True, attend_at_middle=True, layer_cross_attns=True, use_linear_attn=False, use_linear_cross_attn=False, cond_on_text=True, max_text_len=256, init_dim=None, resnet_groups=8, init_conv_kernel_size=7, init_cross_embed=True, init_cross_embed_kernel_sizes=(3, 7, 15), cross_embed_downsample=False, cross_embed_downsample_kernel_sizes=(2, 4), attn_pool_text=True, attn_pool_num_latents=32, dropout=0., memory_efficient=False, init_conv_to_final_conv_residual=False, use_global_context_attn=True, scale_skip_connection=True, final_resnet_block=True, final_conv_kernel_size=3, cosine_sim_attn=False, self_cond=False, combine_upsample_fmaps=False, pixel_shuffle_upsample=True, use_recompute=False): super().__init__() self.use_recompute = use_recompute # guide researchers assert attn_heads > 1, 'you need to have more than 1 attention head, ideally at least 4 or 8' if dim < 128: print_once( 'The base dimension of your u-net should ideally be no smaller than 128, as recommended by a professional DDPM trainer https://nonint.com/2022/05/04/friends-dont-let-friends-train-small-diffusion-models/' ) # save locals to take care of some hyperparameters for cascading DDPM self._locals = locals() self._locals.pop('self', None) self._locals.pop('__class__', None) # determine dimensions self.channels = channels self.channels_out = default(channels_out, channels) init_channels = channels * (1 + int(lowres_cond) + int(self_cond)) init_dim = default(init_dim, dim) self.self_cond = self_cond # optional image conditioning self.has_cond_image = cond_images_channels > 0 self.cond_images_channels = cond_images_channels init_channels += cond_images_channels # initial convolution self.init_conv = CrossEmbedLayer( init_channels, dim_out=init_dim, 
kernel_sizes=init_cross_embed_kernel_sizes, stride=1) if init_cross_embed else nn.Conv2D( init_channels, init_dim, init_conv_kernel_size, padding=init_conv_kernel_size // 2) dims = [init_dim, *map(lambda m: dim * m, dim_mults)] in_out = list(zip(dims[:-1], dims[1:])) # time conditioning cond_dim = default(cond_dim, dim) time_cond_dim = dim * 4 * (2 if lowres_cond else 1) # embedding time for log(snr) noise from continuous version sinu_pos_emb = LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim) sinu_pos_emb_input_dim = learned_sinu_pos_emb_dim + 1 self.to_time_hiddens = nn.Sequential( sinu_pos_emb, nn.Linear(sinu_pos_emb_input_dim, time_cond_dim), nn.Silu()) self.to_time_cond = nn.Sequential( nn.Linear(time_cond_dim, time_cond_dim)) # project to time tokens as well as time hiddens self.to_time_tokens = nn.Sequential( nn.Linear(time_cond_dim, cond_dim * num_time_tokens), Rearrange( 'b (n d) -> b n d', n=num_time_tokens)) # low res aug noise conditioning self.lowres_cond = lowres_cond if lowres_cond: self.to_lowres_time_hiddens = nn.Sequential( LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim), nn.Linear(learned_sinu_pos_emb_dim + 1, time_cond_dim), nn.Silu()) self.to_lowres_time_cond = nn.Sequential( nn.Linear(time_cond_dim, time_cond_dim)) self.to_lowres_time_tokens = nn.Sequential( nn.Linear(time_cond_dim, cond_dim * num_time_tokens), Rearrange( 'b (n d) -> b n d', n=num_time_tokens)) # normalizations self.norm_cond = nn.LayerNorm(cond_dim) # text encoding conditioning (optional) self.text_to_cond = None if cond_on_text: assert exists( text_embed_dim ), 'text_embed_dim must be given to the unet if cond_on_text is True' self.text_to_cond = nn.Linear(text_embed_dim, cond_dim) # finer control over whether to condition on text encodings self.cond_on_text = cond_on_text # attention pooling self.attn_pool = PerceiverResampler( dim=cond_dim, depth=2, dim_head=attn_dim_head, heads=attn_heads, num_latents=attn_pool_num_latents, cosine_sim_attn=cosine_sim_attn) if attn_pool_text else None # for classifier free guidance self.max_text_len = max_text_len self.null_text_embed = self.create_parameter( [1, max_text_len, cond_dim], default_initializer=nn.initializer.Normal()) self.null_text_hidden = self.create_parameter( [1, time_cond_dim], default_initializer=nn.initializer.Normal()) # for non-attention based text conditioning at all points in the network where time is also conditioned self.to_text_non_attn_cond = None if cond_on_text: self.to_text_non_attn_cond = nn.Sequential( nn.LayerNorm(cond_dim), nn.Linear(cond_dim, time_cond_dim), nn.Silu(), nn.Linear(time_cond_dim, time_cond_dim)) # attention related params attn_kwargs = dict( heads=attn_heads, dim_head=attn_dim_head, cosine_sim_attn=cosine_sim_attn, use_recompute=use_recompute) num_layers = len(in_out) # resnet block klass num_resnet_blocks = cast_tuple(num_resnet_blocks, num_layers) resnet_groups = cast_tuple(resnet_groups, num_layers) resnet_klass = partial(ResnetBlock, **attn_kwargs) layer_attns = cast_tuple(layer_attns, num_layers) layer_attns_depth = cast_tuple(layer_attns_depth, num_layers) layer_cross_attns = cast_tuple(layer_cross_attns, num_layers) use_linear_attn = cast_tuple(use_linear_attn, num_layers) use_linear_cross_attn = cast_tuple(use_linear_cross_attn, num_layers) assert all([ layers == num_layers for layers in list( map(len, (resnet_groups, layer_attns, layer_cross_attns))) ]) # downsample klass downsample_klass = Downsample if cross_embed_downsample: downsample_klass = partial( CrossEmbedLayer, 
kernel_sizes=cross_embed_downsample_kernel_sizes) # initial resnet block (for memory efficient unet) self.init_resnet_block = resnet_klass( init_dim, init_dim, time_cond_dim=time_cond_dim, groups=resnet_groups[0], use_gca=use_global_context_attn) if memory_efficient else None # scale for resnet skip connections self.skip_connect_scale = 1. if not scale_skip_connection else (2 **-0.5) # layers self.downs = nn.LayerList([]) self.ups = nn.LayerList([]) num_resolutions = len(in_out) layer_params = [ num_resnet_blocks, resnet_groups, layer_attns, layer_attns_depth, layer_cross_attns, use_linear_attn, use_linear_cross_attn ] reversed_layer_params = list(map(reversed, layer_params)) # downsampling layers skip_connect_dims = [] # keep track of skip connection dimensions for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn ) in enumerate(zip(in_out, *layer_params)): is_last = ind >= (num_resolutions - 1) layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None if layer_attn: transformer_block_klass = TransformerBlock elif layer_use_linear_attn: transformer_block_klass = LinearAttentionTransformerBlock else: transformer_block_klass = Identity current_dim = dim_in # whether to pre-downsample, from memory efficient unet pre_downsample = None if memory_efficient: pre_downsample = downsample_klass(dim_in, dim_out) current_dim = dim_out skip_connect_dims.append(current_dim) # whether to do post-downsample, for non-memory efficient unet post_downsample = None if not memory_efficient: post_downsample = downsample_klass( current_dim, dim_out) if not is_last else Parallel( nn.Conv2D( dim_in, dim_out, 3, padding=1), nn.Conv2D(dim_in, dim_out, 1)) self.downs.append( nn.LayerList([ pre_downsample, resnet_klass( current_dim, current_dim, cond_dim=layer_cond_dim, linear_attn=layer_use_linear_cross_attn, time_cond_dim=time_cond_dim, groups=groups, use_recompute=use_recompute), nn.LayerList([ ResnetBlock( current_dim, current_dim, time_cond_dim=time_cond_dim, groups=groups, use_gca=use_global_context_attn, use_recompute=use_recompute) for _ in range(layer_num_resnet_blocks) ]), transformer_block_klass( dim=current_dim, depth=layer_attn_depth, ff_mult=ff_mult, context_dim=cond_dim, **attn_kwargs), post_downsample ])) # middle layers mid_dim = dims[-1] self.mid_block1 = ResnetBlock( mid_dim, mid_dim, cond_dim=cond_dim, time_cond_dim=time_cond_dim, groups=resnet_groups[-1], use_recompute=use_recompute) self.mid_attn = TransformerBlock( mid_dim, depth=layer_mid_attns_depth, **attn_kwargs) if attend_at_middle else None self.mid_block2 = ResnetBlock( mid_dim, mid_dim, cond_dim=cond_dim, time_cond_dim=time_cond_dim, groups=resnet_groups[-1], use_recompute=use_recompute) # upsample klass upsample_klass = Upsample if not pixel_shuffle_upsample else PixelShuffleUpsample # upsampling layers upsample_fmap_dims = [] for ind, ( (dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn ) in enumerate(zip(reversed(in_out), *reversed_layer_params)): is_last = ind == (len(in_out) - 1) layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None if layer_attn: transformer_block_klass = TransformerBlock elif layer_use_linear_attn: transformer_block_klass = LinearAttentionTransformerBlock else: transformer_block_klass = Identity skip_connect_dim = skip_connect_dims.pop() 
upsample_fmap_dims.append(dim_out) self.ups.append( nn.LayerList([ resnet_klass( dim_out + skip_connect_dim, dim_out, cond_dim=layer_cond_dim, linear_attn=layer_use_linear_cross_attn, time_cond_dim=time_cond_dim, groups=groups, use_recompute=use_recompute), nn.LayerList([ ResnetBlock( dim_out + skip_connect_dim, dim_out, time_cond_dim=time_cond_dim, groups=groups, use_gca=use_global_context_attn, use_recompute=use_recompute) for _ in range(layer_num_resnet_blocks) ]), transformer_block_klass( dim=dim_out, depth=layer_attn_depth, ff_mult=ff_mult, context_dim=cond_dim, **attn_kwargs), upsample_klass(dim_out, dim_in) if not is_last or memory_efficient else Identity() ])) # whether to combine feature maps from all upsample blocks before final resnet block out self.upsample_combiner = UpsampleCombiner( dim=dim, enabled=combine_upsample_fmaps, dim_ins=upsample_fmap_dims, dim_outs=dim) # whether to do a final residual from initial conv to the final resnet block out self.init_conv_to_final_conv_residual = init_conv_to_final_conv_residual final_conv_dim = self.upsample_combiner.dim_out + ( dim if init_conv_to_final_conv_residual else 0) # final optional resnet block and convolution out self.final_res_block = ResnetBlock( final_conv_dim, dim, time_cond_dim=time_cond_dim, groups=resnet_groups[0], use_gca=True, use_recompute=use_recompute) if final_resnet_block else None final_conv_dim_in = dim if final_resnet_block else final_conv_dim final_conv_dim_in += (channels if lowres_cond else 0) self.final_conv = nn.Conv2D( final_conv_dim_in, self.channels_out, final_conv_kernel_size, padding=final_conv_kernel_size // 2) zero_init_(self.final_conv) # if the current settings for the unet are not correct # for cascading DDPM, then reinit the unet with the right settings def cast_model_parameters(self, *, text_embed_dim, channels, channels_out, cond_on_text): if channels == self.channels and \ cond_on_text == self.cond_on_text and \ text_embed_dim == self._locals['text_embed_dim'] and \ channels_out == self.channels_out: return self updated_kwargs = dict( text_embed_dim=text_embed_dim, channels=channels, channels_out=channels_out, cond_on_text=cond_on_text) return self.__class__(**{ ** self._locals, ** updated_kwargs}) # methods for returning the full unet config as well as its parameter state def to_config_and_state_dict(self): return self._locals, self.state_dict() # class method for rehydrating the unet from its config and state dict @classmethod def from_config_and_state_dict(klass, config, state_dict): unet = klass(**config) unet.load_state_dict(state_dict) return unet # methods for persisting unet to disk def persist_to_file(self, path): path = Path(path) path.parents[0].mkdir(exist_ok=True, parents=True) config, state_dict = self.to_config_and_state_dict() pkg = dict(config=config, state_dict=state_dict) paddle.save(pkg, str(path)) # class method for rehydrating the unet from file saved with `persist_to_file` @classmethod def hydrate_from_file(klass, path): path = Path(path) assert path.exists() pkg = paddle.load(str(path)) assert 'config' in pkg and 'state_dict' in pkg config, state_dict = pkg['config'], pkg['state_dict'] return Unet.from_config_and_state_dict(config, state_dict) # forward with classifier free guidance def forward_with_cond_scale(self, *args, cond_scale=1., **kwargs): #print("forward_with_cond_scale.args[1]: ", args[1]) logits = self.forward(*args, **kwargs) if cond_scale == 1: return logits null_logits = self.forward(*args, cond_drop_prob=1., **kwargs) return null_logits + (logits - 
null_logits) * cond_scale def forward(self, x, time, *, lowres_cond_img=None, lowres_noise_times=None, text_embeds=None, text_mask=None, self_cond=None, cond_images=None, cond_drop_prob=0., use_recompute=False): batch_size = x.shape[0] # condition on self if self.self_cond: self_cond = default(self_cond, lambda: paddle.zeros_like(x)) x = paddle.concat((x, self_cond), axis=1) # add low resolution conditioning, if present assert not (self.lowres_cond and not exists(lowres_cond_img) ), 'low resolution conditioning image must be present' assert not (self.lowres_cond and not exists(lowres_noise_times) ), 'low resolution conditioning noise time must be present' if exists(lowres_cond_img): x = paddle.concat((x, lowres_cond_img), axis=1) # condition on input image assert not ( self.has_cond_image ^ exists(cond_images) ), 'you either requested to condition on an image on the unet, but the conditioning image is not supplied, or vice versa' if exists(cond_images): assert cond_images.shape[ 1] == self.cond_images_channels, 'the number of channels on the conditioning image you are passing in does not match what you specified on initialiation of the unet' cond_images = resize_image_to(cond_images, x.shape[-1]) x = paddle.concat((cond_images, x), axis=1) # initial convolution x = self.init_conv(x) # init conv residual if self.init_conv_to_final_conv_residual: init_conv_residual = x.clone() # time conditioning time_hiddens = self.to_time_hiddens(time) # derive time tokens time_tokens = self.to_time_tokens(time_hiddens) t = self.to_time_cond(time_hiddens) if use_recompute: t.stop_gradient = True # add lowres time conditioning to time hiddens # and add lowres time tokens along sequence dimension for attention if self.lowres_cond: lowres_time_hiddens = self.to_lowres_time_hiddens( lowres_noise_times) lowres_time_tokens = self.to_lowres_time_tokens( lowres_time_hiddens) lowres_t = self.to_lowres_time_cond(lowres_time_hiddens) t = t + lowres_t time_tokens = paddle.concat( (time_tokens, lowres_time_tokens), axis=-2) # text conditioning text_tokens = None if exists(text_embeds) and self.cond_on_text: # conditional dropout text_keep_mask = prob_mask_like((batch_size, ), 1 - cond_drop_prob) text_keep_mask_embed = text_keep_mask[:, None, None] text_keep_mask_hidden = text_keep_mask[:, None] # calculate text embeds text_tokens = self.text_to_cond(text_embeds) text_tokens = text_tokens[:, :self.max_text_len] if exists(text_mask): text_mask = text_mask[:, :self.max_text_len] text_tokens_len = text_tokens.shape[1] remainder = self.max_text_len - text_tokens_len if remainder > 0: text_tokens = F.pad(text_tokens, (0, remainder), data_format='NLC') if exists(text_mask): text_mask = text_mask[:, :, None].cast('float32') if remainder > 0: text_mask = F.pad(text_mask, (0, remainder), data_format='NLC') text_keep_mask_embed = text_mask.cast( bool) & text_keep_mask_embed null_text_embed = self.null_text_embed.cast(text_tokens.dtype) text_tokens = paddle.where(text_keep_mask_embed, text_tokens, null_text_embed) if exists(self.attn_pool): text_tokens = self.attn_pool(text_tokens) # extra non-attention conditioning by projecting and then summing text embeddings to time # termed as text hiddens mean_pooled_text_tokens = text_tokens.mean(axis=-2) text_hiddens = self.to_text_non_attn_cond(mean_pooled_text_tokens) null_text_hidden = self.null_text_hidden.cast(t.dtype) text_hiddens = paddle.where(text_keep_mask_hidden, text_hiddens, null_text_hidden) t = t + text_hiddens # main conditioning tokens (c) c = time_tokens if not 
exists(text_tokens) else paddle.concat( (time_tokens, text_tokens), axis=-2) # normalize conditioning tokens c = self.norm_cond(c) if use_recompute: c.stop_gradient = True # initial resnet block (for memory efficient unet) if exists(self.init_resnet_block): x = self.init_resnet_block(x, t) hiddens = [] for pre_downsample, init_block, resnet_blocks, attn_block, post_downsample in self.downs: if exists(pre_downsample): x = pre_downsample(x) x = init_block(x, t, c) for resnet_block in resnet_blocks: x = resnet_block(x, t) hiddens.append(x) x = attn_block(x, c) hiddens.append(x) if exists(post_downsample): x = post_downsample(x) x = self.mid_block1(x, t, c) if exists(self.mid_attn): x = self.mid_attn(x) x = self.mid_block2(x, t, c) add_skip_connection = lambda x: paddle.concat((x, hiddens.pop() * self.skip_connect_scale), axis=1) up_hiddens = [] for init_block, resnet_blocks, attn_block, upsample in self.ups: x = add_skip_connection(x) x = init_block(x, t, c) for resnet_block in resnet_blocks: x = add_skip_connection(x) x = resnet_block(x, t) x = attn_block(x, c) up_hiddens.append(x) x = upsample(x) x = self.upsample_combiner(x, up_hiddens) if self.init_conv_to_final_conv_residual: x = paddle.concat((x, init_conv_residual), axis=1) if exists(self.final_res_block): x = self.final_res_block(x, t) if exists(lowres_cond_img): x = paddle.concat((x, lowres_cond_img), axis=1) return self.final_conv(x) ================================================ FILE: ppfleetx/models/multimodal_model/imagen/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math from functools import partial, wraps import paddle from paddle import nn import paddle.nn.functional as F from paddle import expm1 # helper functions def exists(val): return val is not None def identity(t, *args, **kwargs): return t def first(arr, d=None): if len(arr) == 0: return d return arr[0] def maybe(fn): @wraps(fn) def inner(x): if not exists(x): return x return fn(x) return inner def once(fn): called = False @wraps(fn) def inner(x): nonlocal called if called: return called = True return fn(x) return inner print_once = once(print) def default(val, d): if exists(val): return val return d() if callable(d) else d def cast_tuple(val, length=None): if isinstance(val, list): val = tuple(val) output = val if isinstance(val, tuple) else ((val, ) * default(length, 1)) if exists(length): assert len(output) == length return output def is_float_dtype(dtype): return any([ dtype == float_dtype for float_dtype in (paddle.float64, paddle.float32, paddle.float16, paddle.bfloat16) ]) def cast_uint8_images_to_float(images): if not images.dtype == paddle.uint8: return images return images / 255 zeros_ = nn.initializer.Constant(value=0.) 
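# Illustrative usage of the helpers above (a hedged sketch, not part of the
# original module): `default` resolves optional values, invoking the fallback
# only when it is callable, and `cast_tuple` broadcasts scalar hyper-parameters
# to fixed-length per-layer tuples.
if __name__ == '__main__':
    assert default(None, 3) == 3
    assert default(None, lambda: 7) == 7
    assert default('keep', 3) == 'keep'
    assert cast_tuple(2, 4) == (2, 2, 2, 2)
    assert cast_tuple((1, 2, 4), 3) == (1, 2, 4)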
def zero_init_(m): zeros_(m.weight) if exists(m.bias): zeros_(m.bias) def eval_decorator(fn): def inner(model, *args, **kwargs): was_training = model.training model.eval() out = fn(model, *args, **kwargs) if was_training: model.train(was_training) return out return inner def pad_tuple_to_length(t, length, fillvalue=None): remain_length = length - len(t) if remain_length <= 0: return t return (*t, *((fillvalue, ) * remain_length)) # helper classes class Identity(nn.Layer): def __init__(self, *args, **kwargs): super().__init__() def forward(self, x, *args, **kwargs): return x # tensor helpers def log(t, eps: float=1e-12): return paddle.log(t.clip(min=eps)) class Parallel(nn.Layer): def __init__(self, *fns): super().__init__() self.fns = nn.LayerList(fns) def forward(self, x): outputs = [fn(x) for fn in self.fns] return sum(outputs) def l2norm(t): return F.normalize(t, axis=-1) def right_pad_dims_to(x, t): padding_dims = x.ndim - t.ndim if padding_dims <= 0: return t return t.reshape([*t.shape, *((1, ) * padding_dims)]) def masked_mean(t, *, axis, mask=None): if not exists(mask): return t.mean(axis=axis) denom = mask.sum(axis=axis, keepdim=True) mask = mask[:, :, None] masked_t = paddle.where(mask == 0, paddle.to_tensor(0.), t) return masked_t.sum(axis=axis) / denom.clip(min=1e-5) def resize_image_to(image, target_image_size, clamp_range=None): orig_image_size = image.shape[-1] if orig_image_size == target_image_size: return image out = F.interpolate( image, (target_image_size, target_image_size), mode='nearest') if exists(clamp_range): out = out.clip(*clamp_range) return out # image normalization functions # ddpms expect images to be in the range of -1 to 1 def normalize_neg_one_to_one(img): return img * 2 - 1 def unnormalize_zero_to_one(normed_img): return (normed_img + 1) * 0.5 # classifier free guidance functions def prob_mask_like(shape, prob): if prob == 1: return paddle.ones(shape, dtype=paddle.bool) elif prob == 0: return paddle.zeros(shape, dtype=paddle.bool) else: return paddle.zeros(shape).cast('float32').uniform_(0, 1) < prob def rearrange(tensor, pattern: str, b: int=-1, h: int=-1, w: int=-1, c: int=-1, x: int=-1, y: int=-1, n: int=-1, s1: int=-1, s2: int=-1): if pattern == 'b n (h d) -> b h n d': B, N, _ = tensor.shape return tensor.reshape([B, N, h, -1]).transpose([0, 2, 1, 3]) elif pattern == 'b n (h d) -> (b h) n d': B, N, _ = tensor.shape return tensor.reshape([B, N, h, -1]).transpose([0, 2, 1, 3]).reshape( [B * h, N, -1]) elif pattern == 'b (h c) x y -> (b h) (x y) c': B, _, _, _ = tensor.shape return tensor.reshape([B, h, -1, x, y]).transpose( [0, 1, 3, 4, 2]).reshape([B * h, x * y, -1]) elif pattern == 'b n ... -> b n (...)': B, N = tensor.shape[:2] return tensor.reshape([B, N, -1]) elif pattern == 'b ... 
-> b (...)': B = tensor.shape[0] return tensor.reshape([B, -1]) elif pattern == 'b j -> b 1 1 j': return tensor[:, None, None, :] elif pattern == 'b h n d -> b n (h d)': B, H, N, D = tensor.shape return tensor.transpose([0, 2, 1, 3]).reshape([B, N, -1]) elif pattern == '(b h) (x y) d -> b (h d) x y': _, _, D = tensor.shape return tensor.reshape([-1, h, x, y, D]).transpose( [0, 1, 4, 2, 3]).reshape([-1, h * D, x, y]) elif pattern == '(b h) n d -> b n (h d)': _, N, D = tensor.shape return tensor.reshape([-1, h, N, D]).transpose([0, 2, 1, 3]).reshape( [-1, N, h * D]) elif pattern == 'b n -> b n 1': return tensor[:, :, None] elif pattern == 'b c h w -> b (h w) c': B, C, H, W = tensor.shape return tensor.transpose([0, 2, 3, 1]).reshape([B, -1, C]) elif pattern == 'b (h w) c -> b c h w': B, _, C = tensor.shape return tensor.reshape([B, h, w, C]).transpose([0, 3, 1, 2]) elif pattern == 'b (n d) -> b n d': B, _ = tensor.shape return tensor.reshape([B, n, -1]) elif pattern == 'b ... -> b 1 ...': return tensor[:, None] elif pattern == 'b -> b 1 1 1': return tensor[:, None, None, None] elif pattern == 'b c (h s1) (w s2) -> b (c s1 s2) h w': assert s1 is not None assert s2 is not None B, C, H, W = tensor.shape tensor = tensor.reshape([B, C, H // s1, s1, W // s2, s2]) tensor = tensor.transpose([0, 1, 3, 5, 2, 4]) return tensor.reshape([B, C * s1 * s2, H // s1, W // s2]) def rearrange_many(tensors, pattern: str, h: int=-1, x: int=-1, y: int=-1): assert isinstance(tensors, ( list, tuple)), "rearrange_many type must be list or tuple" if isinstance(tensors, tuple): tensors = list(tensors) if len(tensors) == 0: raise TypeError("Rearrange can't be applied to an empty list") for i, tensor in enumerate(tensors): tensors[i] = rearrange(tensor, pattern, h=h, x=x, y=y) return tensors def repeat(tensor, pattern: str, h: int=-1, b: int=-1): if pattern == '1 -> b': if b > 1: b = paddle.to_tensor([b]) return paddle.tile(tensor, repeat_times=b) else: return tensor elif pattern == 't -> b t': tensor = tensor[None, :] return paddle.tile(tensor, repeat_times=(b, 1)) elif pattern == 'n d -> b n d': tensor = tensor[None, :] return paddle.tile(tensor, repeat_times=(b, 1, 1)) elif pattern == 'o ... -> (o 4) ...': return paddle.tile(tensor, repeat_times=(4, 1, 1, 1)) elif pattern == 'd -> b h 1 d': tensor = tensor[None, None, None, :] return paddle.tile(tensor, repeat_times=(b, h, 1, 1)) elif pattern == 'd -> b 1 d': tensor = tensor[None, None, :] return paddle.tile(tensor, repeat_times=(b, 1, 1)) def repeat_many(tensors, pattern: str, h: int=-1, b: int=-1): assert isinstance(tensors, (list, tuple)) if isinstance(tensors, tuple): tensors = list(tensors) if len(tensors) == 0: raise TypeError("Rearrange can't be applied to an empty list") for i, tensor in enumerate(tensors): tensors[i] = repeat(tensor, pattern, h=h, b=b) return tensors def reduce(losses, pattern: str, reduction: str='mean'): if pattern == 'b ... 
-> b': axes = list(range(1, len(losses.shape))) return losses.mean(axes) class EinopsToAndFrom(nn.Layer): def __init__(self, from_einops, to_einops, fn): super().__init__() self.from_einops = from_einops self.to_einops = to_einops self.fn = fn def forward(self, x, **kwargs): shape = x.shape reconstitute_kwargs = dict( tuple(zip(self.from_einops.split(' '), shape))) x = rearrange(x, f'{self.from_einops} -> {self.to_einops}') x = self.fn(x, **kwargs) x = rearrange(x, f'{self.to_einops} -> {self.from_einops}', **reconstitute_kwargs) return x class Rearrange(nn.Layer): def __init__(self, pattern, n=None, s1=None, s2=None): super().__init__() self.pattern = pattern self.n = n self.s1 = s1 self.s2 = s2 def forward(self, x, **kwargs): x = rearrange(x, f'{self.pattern}', n=self.n, s1=self.s1, s2=self.s2) return x # classifier free guidance functions # gaussian diffusion with continuous time helper functions and classes # large part of this was thanks to @crowsonkb at https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/utils.py def beta_linear_log_snr(t): return -paddle.log(expm1(1e-4 + 10 * (t**2))) def alpha_cosine_log_snr(t, s: float=0.008): return -log( (paddle.cos((t + s) / (1 + s) * math.pi * 0.5)**-2) - 1, eps=1e-5 ) # not sure if this accounts for beta being clipped to 0.999 in discrete version def log_snr_to_alpha_sigma(log_snr): return paddle.sqrt(F.sigmoid(log_snr)), paddle.sqrt(F.sigmoid(-log_snr)) class GaussianDiffusionContinuousTimes(nn.Layer): def __init__(self, *, noise_schedule, timesteps=1000): super().__init__() if noise_schedule == 'linear': self.log_snr = beta_linear_log_snr elif noise_schedule == "cosine": self.log_snr = alpha_cosine_log_snr else: raise ValueError(f'invalid noise schedule {noise_schedule}') self.num_timesteps = timesteps def get_times(self, batch_size, noise_level): return paddle.full((batch_size, ), noise_level, dtype=paddle.float32) def sample_random_times(self, batch_size): return paddle.zeros((batch_size, )).cast('float32').uniform_(0, 1) def get_condition(self, times): return maybe(self.log_snr)(times) def get_sampling_timesteps(self, batch): times = paddle.linspace(1., 0., self.num_timesteps + 1) times = repeat(times, 't -> b t', b=batch) times = paddle.stack((times[:, :-1], times[:, 1:]), axis=0) times = times.unbind(axis=-1) return times def q_posterior(self, x_start, x_t, t, *, t_next=None): t_next = default( t_next, lambda: (t - 1. / self.num_timesteps).clip(min=0.)) """ https://openreview.net/attachment?id=2LdBqxc1Yv&name=supplementary_material """ log_snr = self.log_snr(t) log_snr_next = self.log_snr(t_next) log_snr, log_snr_next = map( partial(right_pad_dims_to, x_t), (log_snr, log_snr_next)) alpha, sigma = log_snr_to_alpha_sigma(log_snr) alpha_next, sigma_next = log_snr_to_alpha_sigma(log_snr_next) # c - as defined near eq 33 c = -expm1(log_snr - log_snr_next) posterior_mean = alpha_next * (x_t * (1 - c) / alpha + c * x_start) # following (eq. 
33) posterior_variance = (sigma_next**2) * c posterior_log_variance_clipped = log(posterior_variance, eps=1e-20) return posterior_mean, posterior_variance, posterior_log_variance_clipped def q_sample(self, x_start, t, noise=None): dtype = x_start.dtype if isinstance(t, float): batch = x_start.shape[0] t = paddle.full((batch, ), t, dtype=dtype) noise = default(noise, lambda: paddle.randn(shape=x_start.shape, dtype=dtype)) log_snr = self.log_snr(t).cast(dtype) log_snr_padded_dim = right_pad_dims_to(x_start, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim) return alpha * x_start + sigma * noise, log_snr, alpha, sigma def q_sample_from_to(self, x_from, from_t, to_t, noise=None): shape, dtype = x_from.shape, x_from.dtype batch = shape[0] if isinstance(from_t, float): from_t = paddle.full((batch, ), from_t, dtype=dtype) if isinstance(to_t, float): to_t = paddle.full((batch, ), to_t, dtype=dtype) noise = default(noise, lambda: paddle.randn(shape=x_from.shape, dtype=x_from.dtype)) log_snr = self.log_snr(from_t) log_snr_padded_dim = right_pad_dims_to(x_from, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim) log_snr_to = self.log_snr(to_t) log_snr_padded_dim_to = right_pad_dims_to(x_from, log_snr_to) alpha_to, sigma_to = log_snr_to_alpha_sigma(log_snr_padded_dim_to) return x_from * (alpha_to / alpha) + noise * (sigma_to * alpha - sigma * alpha_to) / alpha def predict_start_from_v(self, x_t, t, v): log_snr = self.log_snr(t) log_snr = right_pad_dims_to(x_t, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr) return alpha * x_t - sigma * v def predict_start_from_noise(self, x_t, t, noise): log_snr = self.log_snr(t) log_snr = right_pad_dims_to(x_t, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr) return (x_t - sigma * noise) / alpha.clip(min=1e-8) class Always(): def __init__(self, val): self.val = val def __call__(self, *args, **kwargs): return self.val ================================================ FILE: ppfleetx/models/multimodal_model/multimodal_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
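# This module adapts the generic `BasicModule` training interface to the Imagen
# text-to-image model: `MultiModalModule` wires the forward/loss/logging steps,
# and `ImagenModule` builds the concrete model and `ImagenCriterion` loss from
# the `Model` and `Loss` sections of the config.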
import sys import copy import paddle from ppfleetx.core.module.basic_module import BasicModule import ppfleetx.models.multimodal_model.imagen as imagen from ppfleetx.utils.log import logger from .utils import process_configs class MultiModalModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(MultiModalModule, self).__init__(configs) self.loss_fn = self.get_loss_fn() def process_configs(self, configs): configs = process_configs(configs) return configs def forward(self, batch): return self.model(**batch) def training_step(self, batch): preds, targets, log_snr, p2_loss_weight_gamma = self(batch) loss = self.loss_fn(preds, targets, log_snr, p2_loss_weight_gamma) return loss def training_step_end(self, log_dict): speed = self.configs.Engine.logging_freq / log_dict['train_cost'] logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, learning rate: %.5e" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], 1. / speed, speed, log_dict['lr'])) def validation_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def validation_step_end(self, log_dict): speed = self.configs.Engine.logging_freq / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], 1. / speed, speed)) def test_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def test_step_end(self, log_dict): speed = self.configs.Engine.logging_freq / log_dict['test_cost'] logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], 1. / speed, speed)) def input_spec(self): return [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) class ImagenModule(MultiModalModule): def __init__(self, configs): super(ImagenModule, self).__init__(configs) def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") imagen_model = model_setting.pop("name") model = getattr(imagen, imagen_model)(**model_setting) return model def get_loss_fn(self): model_setting = copy.deepcopy(self.configs.Loss) loss_fn = imagen.ImagenCriterion(**model_setting) return loss_fn def pretreating_batch(self, batch): return batch ================================================ FILE: ppfleetx/models/multimodal_model/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import copy import yaml import numpy as np import paddle import paddle.distributed as dist from paddle.fluid import core import argparse from functools import reduce from ppfleetx.distributed.apis import env def process_global_configs(config): """ process global configs for hybrid parallel """ dp_degree = config['Distributed']['dp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] configs = config['Global'] if configs['global_batch_size'] is None and configs[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif configs['global_batch_size'] is not None and configs[ 'local_batch_size'] is not None: assert configs['global_batch_size'] // configs['local_batch_size'] == (dp_degree * sharding_degree), "global_batch_size[{}] should be divided by local_batch_size[{}] "\ "when dp_degree is [{}] and sharding_degree is [{}]".format(configs['global_batch_size'], configs['local_batch_size'], dp_degree, sharding_degree) elif configs['global_batch_size'] is not None and configs[ 'local_batch_size'] is None: assert configs['global_batch_size'] % (dp_degree * sharding_degree) == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]"\ .format(configs['global_batch_size'], dp_degree, sharding_degree) configs['local_batch_size'] = configs['global_batch_size'] // ( dp_degree * sharding_degree) else: configs['global_batch_size'] = configs[ 'local_batch_size'] * dp_degree * sharding_degree assert configs['local_batch_size'] % configs['micro_batch_size'] == 0 def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False def process_fused_configs(config): """ process fused configs for hybrid parallel """ nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] configs = config['Fused'] if configs['tensor_fusion']: assert nranks == dp_degree, "tensor_fusion only support single card train or data parallel train" def process_inference_configs(config): """ process fused configs for hybrid parallel """ configs = config['Inference'] if configs['model_dir'] is None: configs['model_dir'] = config['Engine']['save_load']['output_dir'] if configs['mp_degree'] is None: configs['mp_degree'] = config['Distributed']['mp_degree'] def process_model_configs(config): """ process model configs for hybrid parallel """ configs = config['Model'] if configs['use_recompute']: if not configs['recompute_granularity']: configs['recompute_granularity'] = 'full' if configs['fused_linear'] and not is_fused_matmul_bias_supported(): configs['fused_linear'] = False logging.warning( "The flag fused_linear only valid for cuda version higher than 11.6, " "but the paddle is compiled with cuda " + paddle.version.cuda()) def process_optim_configs(config): """ process optim configs for hybrid parallel """ config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][ 'enable'] def process_engine_configs(config): """ process engine configs for hybrid parallel """ configs = config['Engine'] configs['test_iters'] = configs['eval_iters'] * 10 \ if configs.get('test_iters', None) is None \ else configs['test_iters'] configs['accumulate_steps'] = config['Global']['local_batch_size'] \ // config['Global']['micro_batch_size'] def 
process_configs(config): process_fused_configs(config) process_model_configs(config) process_optim_configs(config) process_inference_configs(config) return config ================================================ FILE: ppfleetx/models/protein_folding/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/protein_folding/all_atom.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Dict, Optional import paddle from .common import (batched_gather, ) from . import ( residue_constants, r3, ) def get_chi_atom_indices(): """Returns atom indices needed to compute chi angles for all residue types. Returns: A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are in the order specified in residue_constants.restypes + unknown residue type at the end. For chi angles which are not defined on the residue, the positions indices are by default set to 0. """ chi_atom_indices = [] for residue_name in residue_constants.restypes: residue_name = residue_constants.restype_1to3[residue_name] residue_chi_angles = residue_constants.chi_angles_atoms[residue_name] atom_indices = [] for chi_angle in residue_chi_angles: atom_indices.append( [residue_constants.atom_order[atom] for atom in chi_angle]) for _ in range(4 - len(atom_indices)): atom_indices.append( [0, 0, 0, 0]) # For chi angles not defined on the AA. chi_atom_indices.append(atom_indices) chi_atom_indices.append([[0, 0, 0, 0]] * 4) # For UNKNOWN residue. return paddle.to_tensor(chi_atom_indices) def atom37_to_torsion_angles( aatype: paddle.Tensor, # (B, T, N) all_atom_pos: paddle.Tensor, # (B, T, N, 37, 3) all_atom_mask: paddle.Tensor, # (B, T, N, 37) placeholder_for_undefined=False, ) -> Dict[str, paddle.Tensor]: """Computes the 7 torsion angles (in sin, cos encoding) for each residue. The 7 torsion angles are in the order '[pre_omega, phi, psi, chi_1, chi_2, chi_3, chi_4]', here pre_omega denotes the omega torsion angle between the given amino acid and the previous amino acid. Args: aatype: Amino acid type, given as array with integers. all_atom_pos: atom37 representation of all atom coordinates. all_atom_mask: atom37 representation of mask on all atom coordinates. placeholder_for_undefined: flag denoting whether to set masked torsion angles to zero. 
Returns: Dict containing: * 'torsion_angles_sin_cos': Array with shape (B, N, 7, 2) where the final 2 dimensions denote sin and cos respectively * 'alt_torsion_angles_sin_cos': same as 'torsion_angles_sin_cos', but with the angle shifted by pi for all chi angles affected by the naming ambiguities. * 'torsion_angles_mask': Mask for which chi angles are present. """ # Map aatype > 20 to 'Unknown' (20). aatype = paddle.minimum( aatype.astype('int'), paddle.full( shape=[1], fill_value=20, dtype='int')) num_batch, num_temp, num_res = aatype.shape # Compute the backbone angles. pad = paddle.zeros([num_batch, num_temp, 1, 37, 3]) prev_all_atom_pos = paddle.concat( [pad, all_atom_pos[..., :-1, :, :]], axis=-3) pad = paddle.zeros([num_batch, num_temp, 1, 37]) prev_all_atom_mask = paddle.concat( [pad, all_atom_mask[..., :-1, :]], axis=-2) # For each torsion angle collect the 4 atom positions that define this angle. # shape (B, T, N, atoms=4, xyz=3) pre_omega_atom_pos = paddle.concat( [ prev_all_atom_pos[..., 1:3, :], # prev CA, C all_atom_pos[..., 0:2, :] # this N, CA ], axis=-2) phi_atom_pos = paddle.concat( [ prev_all_atom_pos[..., 2:3, :], # prev C all_atom_pos[..., 0:3, :] # this N, CA, C ], axis=-2) psi_atom_pos = paddle.concat( [ all_atom_pos[..., 0:3, :], # this N, CA, C all_atom_pos[..., 4:5, :] # this O ], axis=-2) # Collect the masks from these atoms. # Shape [batch, n_temp, num_res] pre_omega_mask = ( paddle.prod( prev_all_atom_mask[..., 1:3], axis=-1) # prev CA, C * paddle.prod( all_atom_mask[..., 0:2], axis=-1)) # this N, CA phi_mask = ( prev_all_atom_mask[..., 2] # prev C * paddle.prod( all_atom_mask[..., 0:3], axis=-1)) # this N, CA, C psi_mask = ( paddle.prod( all_atom_mask[..., 0:3], axis=-1) * # this N, CA, C all_atom_mask[..., 4]) # this O # Collect the atoms for the chi-angles. # Compute the table of chi angle indices. Shape: [restypes, chis=4, atoms=4]. chi_atom_indices = get_chi_atom_indices() # Select atoms to compute chis. Shape: [batch, num_temp, num_res, chis=4, atoms=4]. atom_indices = batched_gather( params=chi_atom_indices, indices=aatype, axis=0, batch_dims=0) # Gather atom positions. Shape: [batch, num_temp, num_res, chis=4, atoms=4, xyz=3]. chis_atom_pos = batched_gather( params=all_atom_pos, indices=atom_indices, axis=0, batch_dims=3) # Copy the chi angle mask, add the UNKNOWN residue. Shape: [restypes, 4]. chi_angles_mask = list(residue_constants.chi_angles_mask) chi_angles_mask.append([0.0, 0.0, 0.0, 0.0]) chi_angles_mask = paddle.to_tensor(chi_angles_mask) # Compute the chi angle mask. I.e. which chis angles exist according to the # aatype. Shape [batch, num_temp, num_res, chis=4]. chis_mask = batched_gather( params=chi_angles_mask, indices=aatype, axis=0, batch_dims=0) # Constrain the chis_mask to those chis, where the ground truth coordinates of # all defining four atoms are available. # Gather the chi angle atoms mask. Shape: [batch, num_temp, num_res, chis=4, atoms=4]. chi_angle_atoms_mask = batched_gather( params=all_atom_mask, indices=atom_indices, axis=0, batch_dims=3) # Check if all 4 chi angle atoms were set. Shape: [batch, num_temp, num_res, chis=4]. chi_angle_atoms_mask = paddle.prod(chi_angle_atoms_mask, axis=[-1]) chis_mask = chis_mask * chi_angle_atoms_mask # Stack all torsion angle atom positions. 
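# Order of the stacked angles: [pre_omega, phi, psi, chi_1, chi_2, chi_3, chi_4],
# matching the convention documented in the function docstring.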
# Shape (B, T, N, torsions=7, atoms=4, xyz=3) torsions_atom_pos = paddle.concat( [ pre_omega_atom_pos.unsqueeze(axis=-3), # [:, :, :, None, :, :] phi_atom_pos.unsqueeze(axis=-3), # [:, :, :, None, :, :] psi_atom_pos.unsqueeze(axis=-3), # [:, :, :, None, :, :] chis_atom_pos ], axis=3) # Stack up masks for all torsion angles. # shape (B, T, N, torsions=7) torsion_angles_mask = paddle.concat( [ pre_omega_mask.unsqueeze(axis=-1), # [..., None] phi_mask.unsqueeze(axis=-1), # [..., None] psi_mask.unsqueeze(axis=-1), # [..., None] chis_mask ], axis=-1) # Create a frame from the first three atoms: # First atom: point on x-y-plane # Second atom: point on negative x-axis # Third atom: origin # r3.Rigids (B, T, N, torsions=7) torsion_frames = r3.rigids_from_3_points_vecs( point_on_neg_x_axis=r3.Vecs(torsions_atom_pos[..., 1, :]), origin=r3.Vecs(torsions_atom_pos[..., 2, :]), point_on_xy_plane=r3.Vecs(torsions_atom_pos[..., 0, :])) # Compute the position of the forth atom in this frame (y and z coordinate # define the chi angle) # r3.Vecs (B, T, N, torsions=7) forth_atom_rel_pos = r3.rigids_mul_vecs( r3.invert_rigids(torsion_frames), r3.vecs_from_tensor(torsions_atom_pos[..., 3, :])) # Normalize to have the sin and cos of the torsion angle. # paddle.Tensor (B, T, N, torsions=7, sincos=2) torsion_angles_sin_cos = paddle.stack( [forth_atom_rel_pos.z, forth_atom_rel_pos.y], axis=-1) torsion_angles_sin_cos /= paddle.sqrt( paddle.sum(paddle.square(torsion_angles_sin_cos), axis=-1, keepdim=True) + 1e-8) # Mirror psi, because we computed it from the Oxygen-atom. torsion_angles_sin_cos *= paddle.to_tensor( [1., 1., -1., 1., 1., 1., 1.]).reshape( [1, 1, 1, 7, 1]) # [None, None, None, :, None] # Create alternative angles for ambiguous atom names. chi_is_ambiguous = batched_gather( paddle.to_tensor(residue_constants.chi_pi_periodic), aatype) # chi_is_ambiguous (B, T, N, torsions=4) mirror_torsion_angles = paddle.concat( [ paddle.ones([num_batch, num_temp, num_res, 3]), 1.0 - 2.0 * chi_is_ambiguous ], axis=-1) # mirror_torsion_angles (B, T, N, torsions=7) alt_torsion_angles_sin_cos = torsion_angles_sin_cos * mirror_torsion_angles.unsqueeze( axis=-1) # [:, :, :, :, None] if placeholder_for_undefined: # Add placeholder torsions in place of undefined torsion angles # (e.g. N-terminus pre-omega) placeholder_torsions = paddle.stack( [ paddle.ones(torsion_angles_sin_cos.shape[:-1]), paddle.zeros(torsion_angles_sin_cos.shape[:-1]) ], axis=-1) torsion_angles_sin_cos = torsion_angles_sin_cos * torsion_angles_mask.unsqueeze( axis=-1) + placeholder_torsions * ( 1 - torsion_angles_mask.unsqueeze(axis=-1)) alt_torsion_angles_sin_cos = alt_torsion_angles_sin_cos * torsion_angles_mask.unsqueeze( axis=-1) + placeholder_torsions * ( 1 - torsion_angles_mask.unsqueeze(axis=-1)) return { 'torsion_angles_sin_cos': torsion_angles_sin_cos, # (B, T, N, 7, 2) 'alt_torsion_angles_sin_cos': alt_torsion_angles_sin_cos, # (B, T, N, 7, 2) 'torsion_angles_mask': torsion_angles_mask # (B, T, N, 7) } ================================================ FILE: ppfleetx/models/protein_folding/attentions.py ================================================ """attentions.py.""" # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import gc import numpy as np import paddle import paddle.nn as nn try: from paddle import _legacy_C_ops as _C_ops except: from paddle import _C_ops from ppfleetx.distributed.protein_folding import dap from .common import ( init_gate_linear, init_final_linear, mask_mean, subbatch, ) class Attention(nn.Layer): """Multihead attention.""" def __init__(self, config, global_config, q_dim, kv_dim, output_dim): super(Attention, self).__init__() self.config = config self.global_config = global_config num_head = self.config.num_head key_dim = self.config.get('key_dim', q_dim) value_dim = self.config.get('value_dim', kv_dim) # TODO(GuoxiaWang): delete non fuse_attention related code on dcu self.fuse_attention = self.global_config.fuse_attention self.use_flash_attn = self.global_config.use_flash_attn self.merge_qkv = (q_dim == kv_dim) assert key_dim % num_head == 0 assert value_dim % num_head == 0 key_dim = key_dim // num_head value_dim = value_dim // num_head self.key_dim = key_dim self.value_dim = value_dim self.qkv_w = None self.query_w = None self.key_w = None self.value_w = None if self.merge_qkv and self.fuse_attention: self.qkv_w = paddle.create_parameter( [3, num_head, key_dim, q_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) else: self.query_w = paddle.create_parameter( [q_dim, num_head, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.key_w = paddle.create_parameter( [kv_dim, num_head, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.value_w = paddle.create_parameter( [kv_dim, num_head, value_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.gating_w = None self.gating_b = None if self.config.gating: self.gating_w = paddle.create_parameter( [q_dim, num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) self.gating_b = paddle.create_parameter( [num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(1.0)) if self.global_config.zero_init: init = nn.initializer.Constant(0.0) else: init = nn.initializer.XavierUniform() self.output_w = paddle.create_parameter( [num_head, value_dim, output_dim], 'float32', default_initializer=init) self.output_b = paddle.create_parameter( [output_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) def forward(self, q_data, m_data, bias, nonbatched_bias=None): """Builds Attention module. Args: q_data (float): A tensor of queries, shape [batch, row_size, N_queries, q_channels]. m_data (float): A tensor of memories from which the keys and values are projected, shape [batch, row_size, N_keys, m_channels]. bias (float): A bias for the attention, shape [batch, row_size, num_head, N_queries, N_keys]. nonbatched_bias (float): Shared bias, shape [N_queries, N_keys]. Returns: A float32 tensor of shape [batch_size, row_size, N_queries, output_dim]. 
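Note: when `fuse_attention` is enabled, the QKV, gating and output projections
all run inside the fused `fused_gate_attention` kernel; otherwise the explicit
einsum path below is taken, with `nonbatched_bias` broadcast over the row axis.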
""" if self.fuse_attention: if nonbatched_bias is not None: nonbatched_bias = paddle.unsqueeze(nonbatched_bias, axis=1) import paddle.incubate.nn.functional as F output = F.fused_gate_attention( query=q_data, key=m_data, query_weight=self.query_w, key_weight=self.key_w, value_weight=self.value_w, qkv_weight=self.qkv_w, gate_linear_weight=self.gating_w, gate_linear_bias=self.gating_b, out_linear_weight=self.output_w, out_linear_bias=self.output_b, nonbatched_bias=nonbatched_bias, attn_mask=bias, has_gating=self.config.gating, merge_qkv=self.merge_qkv, use_flash_attn=self.use_flash_attn, ) else: c = self.key_dim**(-0.5) q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) * c k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k) + bias if nonbatched_bias is not None: logits += paddle.unsqueeze(nonbatched_bias, axis=1) weights = nn.functional.softmax(logits) weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) if self.config.gating: gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, self.gating_w) + self.gating_b gate_values = nn.functional.sigmoid(gate_values) weighted_avg *= gate_values output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, self.output_w) + self.output_b return output class GlobalAttention(nn.Layer): """Global attention. Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" lines 2-7 """ def __init__(self, config, global_config, q_dim, kv_dim, output_dim): super(GlobalAttention, self).__init__() self.config = config self.global_config = global_config num_head = self.config.num_head key_dim = self.config.get('key_dim', q_dim) value_dim = self.config.get('value_dim', kv_dim) assert key_dim % num_head == 0 assert value_dim % num_head == 0 key_dim = key_dim // num_head value_dim = value_dim // num_head self.key_dim = key_dim self.value_dim = value_dim self.query_w = paddle.create_parameter( [q_dim, num_head, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.key_w = paddle.create_parameter( [kv_dim, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.value_w = paddle.create_parameter( [kv_dim, value_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) if self.config.gating: self.gating_w = paddle.create_parameter( [q_dim, num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) self.gating_b = paddle.create_parameter( [num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(1.0)) if self.global_config.zero_init: init = nn.initializer.Constant(0.0) else: init = nn.initializer.XavierUniform() self.output_w = paddle.create_parameter( [num_head, value_dim, output_dim], 'float32', default_initializer=init) self.output_b = paddle.create_parameter( [output_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) def forward(self, q_data, m_data, q_mask): """Builds Attention module. Args: q_data (float): A tensor of queries, shape [batch, row_size, N_queries, q_channels]. m_data (float): A tensor of memories from which the keys and values are projected, shape [batch, row_size, N_keys, m_channels]. q_mask (float): A tensor of mask. Returns: A float32 tensor of output. 
""" k = paddle.einsum('nbka,ac->nbkc', m_data, self.key_w) v = paddle.einsum('nbka,ac->nbkc', m_data, self.value_w) # NOTE: differ from non-global version using q_avg for attn q_avg = mask_mean(q_mask, q_data, axis=2) c = self.key_dim**(-0.5) q = paddle.einsum('nba,ahc->nbhc', q_avg, self.query_w) * c q_mask_ = paddle.unsqueeze(q_mask, axis=2)[..., 0] bias = 1e9 * (q_mask_ - 1.) logits = paddle.einsum('nbhc,nbkc->nbhk', q, k) + bias weights = nn.functional.softmax(logits) weighted_avg = paddle.einsum('nbhk,nbkc->nbhc', weights, v) if self.config.gating: gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, self.gating_w) + self.gating_b gate_values = nn.functional.sigmoid(gate_values) weighted_avg = paddle.unsqueeze(weighted_avg, axis=2) weighted_avg *= gate_values output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, self.output_w) + self.output_b else: output = paddle.einsum('nbhc,hco->nbo', weighted_avg, self.output_w) + self.output_b output = paddle.unsqueeze(output, axis=-1) return output class MSARowAttentionWithPairBias(nn.Layer): """MSA per-row attention biased by the pair representation. Jumper et al. (2021) Suppl. Alg. 7 "MSARowAttentionWithPairBias" """ def __init__(self, channel_num, config, global_config, is_extra_msa): super(MSARowAttentionWithPairBias, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config self.is_extra_msa = is_extra_msa assert config.orientation == 'per_row' if is_extra_msa: self.query_norm = nn.LayerNorm(channel_num['extra_msa_channel']) else: self.query_norm = nn.LayerNorm(channel_num['msa_channel']) self.feat_2d_norm = nn.LayerNorm(channel_num['pair_channel']) self.feat_2d_weights = paddle.create_parameter( [channel_num['pair_channel'], self.config.num_head], 'float32', default_initializer=nn.initializer.Normal( std=1. / np.sqrt(channel_num['pair_channel']))) if is_extra_msa: extra_msa_channel = channel_num['extra_msa_channel'] self.attention = Attention(self.config, self.global_config, extra_msa_channel, extra_msa_channel, extra_msa_channel) else: msa_channel = channel_num['msa_channel'] self.attention = Attention(self.config, self.global_config, msa_channel, msa_channel, msa_channel) def forward(self, msa_act, msa_mask, pair_act): """MSARowAttention with masks. Args: msa_act (float): A tensor of msa_act. msa_mask (float): A tensor of msa_mask. pair_act (float): A tensor of pair_act. Returns: A float32 tensor of msa_act. """ pair_act = self.feat_2d_norm(pair_act) # [B, N_res//dap_size, N_res, cz], [cz, head] => [B, head, N_res//dap_size, N_res] nonbatched_bias_before = paddle.einsum('nqkc,ch->nhqk', pair_act, self.feat_2d_weights) # [B, head, N_res//dap_size, N_res] => [B, head, N_res, N_res] nonbatched_bias = dap.all_gather(nonbatched_bias_before, axis=2) # if not self.training: if not self.training and self.global_config.low_memory is True: del nonbatched_bias_before gc.collect() nonbatched_bias = dap.all_gather_opp(nonbatched_bias, axis=2) # [B, N_seq, N_res] => [B, N_seq//dap_size, N_res] msa_mask = dap.scatter(msa_mask, axis=1) bias = 1e9 * (msa_mask - 1.) 
# [B, N_seq//dap_size, N_res] => [B, N_seq//dap_size, 1, 1, N_res] bias = paddle.unsqueeze(bias, axis=[2, 3]) msa_act = self.query_norm(msa_act) if not self.training or (self.is_extra_msa and self.config.use_subbatch): # low memory mode using subbatch subbatch_size = self.config.subbatch_size if not self.training: subbatch_size = self.global_config.subbatch_size sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], subbatch_size, 1, same_arg_idx={1: 0}) msa_act = sb_attn(msa_act, msa_act, bias, nonbatched_bias) else: msa_act = self.attention(msa_act, msa_act, bias, nonbatched_bias) return msa_act class MSAColumnGlobalAttention(nn.Layer): """MSA per-column global attention. Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" """ def __init__(self, channel_num, config, global_config): super(MSAColumnGlobalAttention, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config assert config.orientation == 'per_column' extra_msa_channel = channel_num['extra_msa_channel'] self.query_norm = nn.LayerNorm(extra_msa_channel) self.attention = GlobalAttention(self.config, self.global_config, extra_msa_channel, extra_msa_channel, extra_msa_channel) def forward(self, msa_act, msa_mask): """MSAColumnGlobalAttention. Args: msa_act (float): A tensor of msa_act. msa_mask (float): A tensor of msa_mask. Returns: A float32 tensor of msa_act. """ # scatter if using dap, otherwise do nothing # [B, N_seq, N_res] => [B, N_seq, N_res//dap_size] msa_mask = dap.scatter(msa_mask, axis=2) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) msa_mask = paddle.transpose(msa_mask, [0, 2, 1]) bias = 1e9 * (msa_mask - 1.) bias = paddle.unsqueeze(bias, axis=[2, 3]) msa_mask = paddle.unsqueeze(msa_mask, axis=-1) msa_act = self.query_norm(msa_act) if not self.training: # low memory mode using subbatch sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], self.global_config.subbatch_size, 1, same_arg_idx={1: 0}) msa_act = sb_attn(msa_act, msa_act, msa_mask) else: msa_act = self.attention(msa_act, msa_act, msa_mask) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) return msa_act class MSAColumnAttention(nn.Layer): """MSA per-column attention. Jumper et al. (2021) Suppl. Alg. 8 "MSAColumnAttention" """ def __init__(self, channel_num, config, global_config): super(MSAColumnAttention, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config assert config.orientation == 'per_column' msa_channel = channel_num['msa_channel'] self.query_norm = nn.LayerNorm(msa_channel) self.attention = Attention(self.config, self.global_config, msa_channel, msa_channel, msa_channel) def forward(self, msa_act, msa_mask): """MSAColumnAttention. Args: msa_act (float): A tensor of msa_act. msa_mask (float): A tensor of msa_mask. Returns: A float32 tensor of msa_act. """ # scatter if using dap, otherwise do nothing # [B, N_seq, N_res] => [B, N_seq, N_res//dap_size] msa_mask = dap.scatter(msa_mask, axis=2) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) msa_mask = paddle.transpose(msa_mask, [0, 2, 1]) bias = 1e9 * (msa_mask - 1.) 
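# After the [0, 2, 1, 3] transpose above, axis 1 indexes residues, so the
# attention below mixes information along the MSA column (per-residue, across
# sequences) as in Suppl. Alg. 8.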
bias = paddle.unsqueeze(bias, axis=[2, 3]) msa_act = self.query_norm(msa_act) if not self.training: # low memory mode using subbatch sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], self.global_config.subbatch_size, 1, same_arg_idx={1: 0}) msa_act = sb_attn(msa_act, msa_act, bias) else: msa_act = self.attention(msa_act, msa_act, bias) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) return msa_act class TriangleAttention(nn.Layer): """Triangle Attention. Jumper et al. (2021) Suppl. Alg. 13 "TriangleAttentionStartingNode" Jumper et al. (2021) Suppl. Alg. 14 "TriangleAttentionEndingNode" """ def __init__(self, channel_num, config, global_config, name='triangle_attention'): super(TriangleAttention, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config assert config.orientation in ['per_row', 'per_column'] self.query_norm = nn.LayerNorm( channel_num['pair_channel'], name='query_norm') self.feat_2d_weights = paddle.create_parameter( [channel_num['pair_channel'], self.config.num_head], 'float32', default_initializer=nn.initializer.Normal( std=1. / np.sqrt(channel_num['pair_channel']))) self.attention = Attention( self.config, self.global_config, channel_num['pair_channel'], channel_num['pair_channel'], channel_num['pair_channel']) def forward(self, pair_act, pair_mask): """Builds TriangleAttention module. Args: pair_act (float): [batch, N_res, N_res, c_z] pair activations tensor pair_mask (float): [batch, N_res, N_res] mask of non-padded regions in the tensor. Returns: Update to pair_act, shape [batch, N_res, N_res, c_z]. """ if self.config.orientation == 'per_column': pair_act = pair_act.transpose([0, 2, 1, 3]) pair_mask = pair_mask.transpose([0, 2, 1]) # [B, N_res//dap_size, N_res] bias = 1e9 * (pair_mask - 1.) # [B, N_res//dap_size, 1, 1, N_res] bias = paddle.unsqueeze(bias, axis=[2, 3]) pair_act = self.query_norm(pair_act) # [B, N_res//dap_size, N_res, cz], [cz, head] => [B, head, N_res//dap_size, N_res] nonbatched_bias_before = paddle.einsum('bqkc,ch->bhqk', pair_act, self.feat_2d_weights) # # [B, head, N_res//dap_size, N_res] => [B, head, N_res, N_res] nonbatched_bias = dap.all_gather(nonbatched_bias_before, axis=2) # if not self.training: if not self.training and self.global_config.low_memory is True: del nonbatched_bias_before gc.collect() nonbatched_bias = dap.all_gather_opp(nonbatched_bias, axis=2) if not self.training: # low memory mode using subbatch sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], self.global_config.subbatch_size, 1, same_arg_idx={1: 0}) pair_act = sb_attn(pair_act, pair_act, bias, nonbatched_bias) else: pair_act = self.attention(pair_act, pair_act, bias, nonbatched_bias) if self.config.orientation == 'per_column': pair_act = pair_act.transpose([0, 2, 1, 3]) return pair_act class TriangleMultiplication(nn.Layer): """Triangle multiplication layer ("outgoing" or "incoming"). Jumper et al. (2021) Suppl. Alg. 11 "TriangleMultiplicationOutgoing" Jumper et al. (2021) Suppl. Alg. 
12 "TriangleMultiplicationIncoming" """ def __init__(self, channel_num, config, global_config, name='triangle_multiplication'): super(TriangleMultiplication, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear self.layer_norm_input = nn.LayerNorm( self.channel_num['pair_channel'], name='layer_norm_input') self.left_projection = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='left_projection') self.right_projection = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='right_projection') self.left_gate = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='left_gate') init_gate_linear(self.left_gate) self.right_gate = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='right_gate') init_gate_linear(self.right_gate) # line 4 self.center_layer_norm = nn.LayerNorm( self.config.num_intermediate_channel, name='center_layer_norm') self.output_projection = Linear( self.config.num_intermediate_channel, self.channel_num['pair_channel'], name='output_projection') init_final_linear(self.output_projection) # line 3 self.gating_linear = Linear( self.channel_num['pair_channel'], self.channel_num['pair_channel'], name='output_projection') init_gate_linear(self.gating_linear) def forward(self, act, mask): """Builds TriangleMultiplication module. Args: act (float): Pair activations, shape [batch, N_res, N_res, c_z] mask (float): Pair mask, shape [batch, N_res, N_res]. Returns: Outputs, same shape/type as act. """ # Outgoing [batch, N_res//dap_size, N_res] => [batch, N_res//dap_size, N_res, 1] # Incoming [batch, N_res, N_res//dap_size] => [batch, N_res, N_res//dap_size, 1] mask = paddle.unsqueeze(mask, axis=-1) # [batch, N_res, N_res, 1] # Outgoing [B, N_res//dap_size, N_res, c_z] # Incoming [B, N_res, N_res//dap_size, c_z] act = self.layer_norm_input(act) # line 1 # if not self.training: if not self.training and self.global_config.low_memory is True: # Note(GuoxiaWang): using inplace version to save memory(low_mem=True). left_proj_act = self.left_gate(act) left_proj_act.sigmoid_() left_proj_act.multiply_(self.left_projection(act)) left_proj_act.multiply_(mask) right_proj_act_before = self.right_gate(act) right_proj_act_before.sigmoid_() right_proj_act_before.multiply_(self.right_projection(act)) right_proj_act_before.multiply_(mask) else: # Outgoing [B, N_res//dap_size, N_res, c_z] => [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming [B, N_res, N_res//dap_size, c_z] => [B, N_res, N_res//dap_size, num_intermediate_channel] left_proj_act = mask * self.left_projection(act) right_proj_act = mask * self.right_projection(act) # Outgoing [B, N_res//dap_size, N_res, c_z] => [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming [B, N_res, N_res//dap_size, c_z] => [B, N_res, N_res//dap_size, num_intermediate_channel] left_gate_values = nn.functional.sigmoid(self.left_gate(act)) right_gate_values = nn.functional.sigmoid(self.right_gate(act)) # Outgoing [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming [B, N_res, N_res//dap_size, num_intermediate_channel] left_proj_act = left_proj_act * left_gate_values right_proj_act_before = right_proj_act * right_gate_values # "Outgoing" edges equation: 'ikc,jkc->ijc' # "Incoming" edges equation: 'kjc,kic->ijc' # Note on the Suppl. Alg. 
11 & 12 notation: # For the "outgoing" edges, a = left_proj_act and b = right_proj_act # For the "incoming" edges, it's swapped: # b = left_proj_act and a = right_proj_act if self.config.equation == 'ikc,jkc->ijc': # Outgoing # [B, N_res//dap_size, N_res, num_intermediate_channel] => [B, N_res, N_res, num_intermediate_channel] right_proj_act = dap.all_gather(right_proj_act_before, axis=1) # if not self.training: if not self.training and self.global_config.low_memory is True: del right_proj_act_before gc.collect() elif self.config.equation == 'kjc,kic->ijc': # Incoming # [B, N_res, N_res//dap_size, num_intermediate_channel] => [B, N_res, N_res, num_intermediate_channel] right_proj_act = dap.all_gather(right_proj_act_before, axis=2) # if not self.training: if not self.training and self.global_config.low_memory is True: del right_proj_act_before gc.collect() else: raise ValueError('unknown equation.') # Outgoing [B, N_res//dap_size, N_res, c_z] # Incoming [B, N_res, N_res//dap_size, c_z] # if not self.training: if not self.training and self.global_config.low_memory is True: gate_values = self.gating_linear(act).sigmoid_() # line 3 else: gate_values = nn.functional.sigmoid( self.gating_linear(act)) # line 3 if self.config.equation == 'ikc,jkc->ijc': # Outgoing dim, out_idx = 1, 1 equation = 'bikc,bjkc->bijc' # [B, N_res, N_res, num_intermediate_channel] right_proj_act_after = dap.all_gather_opp(right_proj_act, axis=1) elif self.config.equation == 'kjc,kic->ijc': # Incoming dim, out_idx = 2, 2 equation = 'bkjc,bkic->bijc' # [B, N_res, N_res, num_intermediate_channel] right_proj_act_after = dap.all_gather_opp(right_proj_act, axis=2) else: raise ValueError('unknown equation.') if not self.training: einsum_fn = subbatch(paddle.einsum, [1], [dim], self.global_config.subbatch_size, out_idx) act = einsum_fn(equation, left_proj_act, right_proj_act_after) else: # Outgoing equation = 'bikc,bjkc->bijc' # [B, N_res//dap_size, N_res, num_intermediate_channel], [B, N_res, N_res, num_intermediate_channel] # => [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming equation = 'bkjc,bkic->bijc' # [B, N_res, N_res//dap_size, num_intermediate_channel], [B, N_res, N_res, num_intermediate_channel] # => [B, N_res, N_res//dap_size, num_intermediate_channel] act = paddle.einsum(equation, left_proj_act, right_proj_act_after) act = self.center_layer_norm(act) act = self.output_projection(act) act = act * gate_values return act ================================================ FILE: ppfleetx/models/protein_folding/common.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
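# ---------------------------------------------------------------------------
# Illustrative note (not part of the original file): the subbatch() helper
# defined below wraps a function so that it is applied chunk-by-chunk along a
# chosen input dimension and the partial outputs are concatenated back
# together, trading a little extra compute for a much smaller activation
# footprint at inference time. A minimal usage sketch, assuming a working
# paddle install and a hypothetical element-wise `toy_fn`:
#
#     import paddle
#
#     def toy_fn(x):                 # any function of a [B, L, C] tensor
#         return x * 2.0
#
#     x = paddle.rand([1, 10, 4])
#     chunked = subbatch(toy_fn, arg_idx=[0], dim=[1], bs=3, out_idx=1)
#     y = chunked(x)                 # evaluated in slices of 3 along dim 1
#     # y equals toy_fn(x), but peak memory scales with the slice size.
# ---------------------------------------------------------------------------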
import numpy as np import functools import numbers import collections import paddle import paddle.nn as nn from paddle.distributed.fleet.utils import recompute try: from paddle import _legacy_C_ops as _C_ops except: from paddle import _C_ops def set_tensor_constant(tensor, constant): tensor.set_value(paddle.full_like(tensor, constant)) def init_gate_linear(linear): set_tensor_constant(linear.weight, 0) set_tensor_constant(linear.bias, 1) def init_final_linear(linear): set_tensor_constant(linear.weight, 0) def recompute_wrapper(func, *args, is_recompute=True): """Function wrapper for recompute""" if is_recompute: return recompute(func, *args) else: return func(*args) def subbatch(f, arg_idx, dim, bs, out_idx, same_arg_idx={}): """ Converts a function to one that applies to subbatch of an input dimension. Args: f(Callable): original function. arg_idx([int]): indices of the inputs to be subbatched. dim([int]): index of the dimension to be subbatched. bs(int): subbatch size. out_idx(int): index of the output dimension that needs stacking same_arg_idx(dict), optional: index of same arg mapping. e.g {1: 0} means arg[1] == arg[0], we assign _args[1] = _args[0] avoiding slice repeatly. Returns: converted function. """ @functools.wraps(f) def wrapper(*args, **kwargs): assert len(arg_idx) == len( dim ), f'Number of batching args and number of batching dims should match.' inps = [args[i] for i in arg_idx] dim_width = [inp.shape[d] for inp, d in zip(inps, dim)] assert len(set(dim_width)) == 1, f'Batch sizes should be kept equal.' inp_dim = {inp: d for inp, d in zip(inps, dim)} dim_width = dim_width[0] if dim_width < bs: return f(*args, **kwargs) outs = [] for slice_at in np.arange(0, dim_width, bs): _args = [] for i, inp in enumerate(args): if i in same_arg_idx: assert i > same_arg_idx[ i], f"expect i > same_arg_idx[i], but got i: {i} and same_arg_idx[i]: {same_arg_idx[i]}" _args.append(_args[same_arg_idx[i]]) elif i in arg_idx: inp = inp.slice([inp_dim[inp]], [slice_at], [slice_at + bs]) _args.append(inp) else: _args.append(inp) outs.append(f(*_args, **kwargs)) return paddle.concat(outs, out_idx) return wrapper def batched_gather(params, indices, axis=0, batch_dims=0): # Implement gather with batching, like tensorflow: # https://www.tensorflow.org/api_docs/python/tf/gather#batching # print(params.shape, indices.shape, axis) p, i = params, indices rank = len(p.shape) axis = (rank + axis) % rank # The stride of axis stride = p.shape[batch_dims + axis] if batch_dims == 0 and len(i.shape) == 1: return paddle.gather(p, i, axis=axis) elif batch_dims == 0: flat_i = i.reshape([-1]) gathered = paddle.gather(p, flat_i, axis=axis) shape = p.shape[:axis] + i.shape if axis < rank - 1: shape += params.shape[axis + 1:] return gathered.reshape(shape) b = batch_dims a = axis assert p.shape[:b] == i.shape[:b] bn = np.prod(p.shape[:b]) # Shift batch dimensions right to bundle with axis if a > 0: perm = list(range(rank)) perm = perm[b:(b + a)] + perm[:b] + perm[(b + a):] p = p.transpose(perm) # Merge params' batch+axis p = p.reshape(p.shape[:a] + [-1] + p.shape[(b + a + 1):]) # indices = [Batch..., Index...] 
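    # Note (added for clarity, not in the original source): once the batch
    # dimensions have been bundled into the gather axis above, batch element
    # b owns a contiguous block of `stride` entries along the merged axis.
    # Adding b * stride to every index belonging to batch b therefore turns a
    # per-batch gather into a single flat paddle.gather over the merged axis.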
# Expand the index values across batch elements strides = paddle.arange(bn, dtype="int64").unsqueeze(-1) * stride i = i.reshape([bn, -1]) flat_i = paddle.flatten(i + strides) # Do gather gathered = paddle.gather(p, flat_i, axis=axis) # Unbundle batch and index dimensions unbundled_shape = p.shape[:a] + indices.shape + p.shape[a + 1:] gathered = gathered.reshape(unbundled_shape) # Shift batch dimensions back to the left if a > 0: perm = list(range(len(unbundled_shape))) perm = perm[a:(a + b)] + perm[:a] + perm[(a + b):] gathered = gathered.transpose(perm) return gathered def mask_mean(mask, value, axis=None, drop_mask_channel=False, eps=1e-10): if drop_mask_channel: mask = mask[:, 0] mask_shape = mask.shape value_shape = value.shape assert len(mask_shape) == len(value_shape) if isinstance(axis, numbers.Integral): axis = [axis] elif axis is None: axis = list(range(len(mask_shape))) assert isinstance(axis, collections.abc.Iterable), \ 'axis needs to be either an iterable, integer or "None"' broadcast_factor = 1. for axis_ in axis: value_size = value_shape[axis_] mask_size = mask_shape[axis_] if mask_size == 1: broadcast_factor *= value_size else: assert mask_size == value_size return (paddle.sum(mask * value, axis=axis) / (paddle.sum(mask, axis=axis) * broadcast_factor + eps)) class Transition(nn.Layer): """Transition layer. Jumper et al. (2021) Suppl. Alg. 9 "MSATransition" Jumper et al. (2021) Suppl. Alg. 15 "PairTransition" """ def __init__(self, channel_num, config, global_config, is_extra_msa, transition_type): super(Transition, self).__init__() assert transition_type in ['msa_transition', 'pair_transition'] self.channel_num = channel_num self.config = config self.global_config = global_config self.is_extra_msa = is_extra_msa self.transition_type = transition_type Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear if transition_type == 'msa_transition' and is_extra_msa: in_dim = channel_num['extra_msa_channel'] elif transition_type == 'msa_transition' and not is_extra_msa: in_dim = channel_num['msa_channel'] elif transition_type == 'pair_transition': in_dim = channel_num['pair_channel'] self.input_layer_norm = nn.LayerNorm(in_dim) self.transition1 = Linear( in_dim, int(in_dim * self.config.num_intermediate_factor), weight_attr=paddle.ParamAttr( initializer=nn.initializer.KaimingNormal())) if self.global_config.zero_init: last_init = nn.initializer.Constant(0.0) else: last_init = nn.initializer.TruncatedNormal() self.transition2 = Linear( int(in_dim * self.config.num_intermediate_factor), in_dim, weight_attr=paddle.ParamAttr(initializer=last_init)) def forward(self, act, mask): act = self.input_layer_norm(act) def transition_module(x): x = self.transition1(x) x = nn.functional.relu(x) x = self.transition2(x) return x if not self.training: # low memory mode using subbatch sb_transition = subbatch(transition_module, [0], [1], self.global_config.subbatch_size, 1) act = sb_transition(act) else: act = transition_module(act) return act class Dropout(nn.Layer): def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): super(Dropout, self).__init__() if not isinstance(p, (float, int)): raise TypeError("p argument should be a number") if p < 0 or p > 1: raise ValueError("p argument should between 0 and 1") mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" 
) if axis and not isinstance(axis, (int, list, tuple)): raise TypeError("datatype of axis argument should be int or list") self.p = p self.axis = axis self.mode = mode self.name = name def forward(self, input): # fast return for p == 0 if self.p == 0: return input if self.axis == None: out = nn.functional.dropout( input, p=self.p, axis=self.axis, training=self.training, mode=self.mode, name=self.name) else: seed = None drop_axes = [self.axis] if isinstance(self.axis, int) else list(self.axis) if paddle.static.default_main_program().random_seed != 0: seed = paddle.static.default_main_program().random_seed out, mask = _C_ops.dropout_nd( input, 'dropout_prob', self.p, 'is_test', not self.training, 'fix_seed', seed is not None, 'seed', seed if seed is not None else 0, 'dropout_implementation', self.mode, 'axis', drop_axes) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'p={}, axis={}, mode={}{}'.format(self.p, self.axis, self.mode, name_str) def dgram_from_positions(positions, num_bins, min_bin, max_bin): lower_breaks = paddle.linspace(min_bin, max_bin, num_bins) lower_breaks = paddle.square(lower_breaks) upper_breaks = paddle.concat([ lower_breaks[1:], paddle.full( shape=[1], fill_value=1e8, dtype='float32') ]) def _squared_difference(x, y): return paddle.square(x - y) dist2 = paddle.sum(_squared_difference( paddle.unsqueeze( positions, axis=-2), paddle.unsqueeze( positions, axis=-3)), axis=-1, keepdim=True) dgram = ((dist2 > lower_breaks.astype(dist2.dtype)).astype('float32') * (dist2 < upper_breaks.astype(dist2.dtype)).astype('float32')) return dgram ================================================ FILE: ppfleetx/models/protein_folding/evoformer.py ================================================ """evoformer.py.""" # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import gc import paddle import paddle.nn as nn from ppfleetx.distributed.protein_folding import bp, dap from .attentions import ( MSARowAttentionWithPairBias, MSAColumnGlobalAttention, MSAColumnAttention, TriangleMultiplication, TriangleAttention, ) from .common import ( Transition, Dropout, recompute_wrapper, dgram_from_positions, ) from .template import (TemplateEmbedding, ) from .outer_product_mean import (OuterProductMean, ) from . import ( residue_constants, all_atom, ) class EvoformerIteration(nn.Layer): """Single iteration (block) of Evoformer stack. Jumper et al. (2021) Suppl. Alg. 
6 "EvoformerStack" lines 2-10 """ def __init__(self, channel_num, config, global_config, is_extra_msa=False): super(EvoformerIteration, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config self.is_extra_msa = is_extra_msa assert self.global_config.outer_product_mean_position in [ 'origin', 'middle', 'first', 'end' ] # Row-wise Gated Self-attention with Pair Bias self.msa_row_attention_with_pair_bias = MSARowAttentionWithPairBias( channel_num, self.config.msa_row_attention_with_pair_bias, self.global_config, is_extra_msa) dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_row_attention_with_pair_bias) self.msa_row_attn_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) if self.is_extra_msa: self.msa_column_global_attention = MSAColumnGlobalAttention( channel_num, config.msa_column_attention, global_config) dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_column_global_attention) self.msa_col_attn_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) else: self.msa_column_attention = MSAColumnAttention( channel_num, config.msa_column_attention, global_config) dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_column_attention) self.msa_col_attn_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.msa_transition = Transition( channel_num, self.config.msa_transition, self.global_config, is_extra_msa, 'msa_transition') dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_transition) self.msa_transition_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # OuterProductMean self.outer_product_mean = OuterProductMean( channel_num, self.config.outer_product_mean, self.global_config, self.is_extra_msa, name='outer_product_mean') # Dropout dropout_rate, dropout_axis = self._parse_dropout_params( self.outer_product_mean) self.outer_product_mean_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # Triangle Multiplication. self.triangle_multiplication_outgoing = TriangleMultiplication( channel_num, self.config.triangle_multiplication_outgoing, self.global_config, name='triangle_multiplication_outgoing') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_multiplication_outgoing) self.triangle_outgoing_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_multiplication_incoming = TriangleMultiplication( channel_num, self.config.triangle_multiplication_incoming, self.global_config, name='triangle_multiplication_incoming') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_multiplication_incoming) self.triangle_incoming_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # TriangleAttention. 
self.triangle_attention_starting_node = TriangleAttention( channel_num, self.config.triangle_attention_starting_node, self.global_config, name='triangle_attention_starting_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_starting_node) self.triangle_starting_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_attention_ending_node = TriangleAttention( channel_num, self.config.triangle_attention_ending_node, self.global_config, name='triangle_attention_ending_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_ending_node) self.triangle_ending_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # Pair transition. self.pair_transition = Transition( channel_num, self.config.pair_transition, self.global_config, is_extra_msa, 'pair_transition') dropout_rate, dropout_axis = self._parse_dropout_params( self.pair_transition) self.pair_transition_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) def _parse_dropout_params(self, module): """tbd.""" dropout_rate = 0.0 if self.global_config.deterministic else \ module.config.dropout_rate dropout_axis = None if module.config.shared_dropout: dropout_axis = { 'per_row': [0, 2, 3], 'per_column': [0, 1, 3], }[module.config.orientation] return dropout_rate, dropout_axis def outer_product_mean_origin(self, msa_act, pair_act, masks): """tbd.""" assert bp.get_world_size( ) == 1, "Branch Parallel degree must be 1 for outer_product_mean_origin" msa_mask, pair_mask = masks['msa'], masks['pair'] # [B, N_seq//dap_size, N_res, c_m] residual = self.msa_row_attention_with_pair_bias(msa_act, msa_mask, pair_act) residual = self.msa_row_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m] msa_act = dap.row_to_col(msa_act) if self.is_extra_msa: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_global_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual else: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual # [B, N_res//dap_size, N_res, c_z] residual = self.outer_product_mean(msa_act, msa_mask) outer_product_mean = self.outer_product_mean_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(outer_product_mean) del outer_product_mean gc.collect() else: pair_act = pair_act + outer_product_mean # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_act = dap.col_to_row(msa_act) # scatter if using dap, otherwise do nothing pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) # [B, N_res//dap_size, N_res, c_z] # TODO(GuoxiaWang): why have diffrence whether remove pair_act = pair_act.clone() # pair_act = 
pair_act.clone() residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) return msa_act, pair_act def outer_product_mean_first(self, msa_act, pair_act, masks): """tbd.""" raise NotImplementedError( "BP or DAP does not support outer_product_mean_first") def outer_product_mean_end(self, msa_act, pair_act, masks): """tbd.""" msa_mask, pair_mask = masks['msa'], masks['pair'] if bp.get_world_size() > 1: # Note(GuoxiaWang): add zeros trigger the status of stop_gradient=False within recompute context. 
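            # Note (added for clarity, not in the original source): adding a
            # zero tensor below is numerically a no-op, but it produces a new
            # tensor whose stop_gradient is False inside the recompute scope,
            # and the grad hook registered right after it all-reduces
            # pair_act's gradient across the branch-parallel group, since the
            # MSA branch and the pair branch each contribute a partial grad.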
pair_act = pair_act + paddle.zeros_like(pair_act) # Note(GuoxiaWang): reduce the pair_act's gradient from msa branch and pair branch if not pair_act.stop_gradient: pair_act._register_grad_hook(bp.all_reduce) if bp.get_rank_in_group() == 0: # [B, N_seq//dap_size, N_res, c_m] residual = self.msa_row_attention_with_pair_bias( msa_act, msa_mask, pair_act) residual = self.msa_row_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m] msa_act = dap.row_to_col(msa_act) if self.is_extra_msa: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_global_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual else: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual # [B, N_res//dap_size, N_res, c_z] residual = self.outer_product_mean(msa_act, msa_mask) outer_product_mean = self.outer_product_mean_dropout(residual) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_act = dap.col_to_row(msa_act) if bp.get_rank_in_group() == 1: # scatter if using dap, otherwise do nothing pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) outer_product_mean = paddle.zeros_like(pair_act) outer_product_mean.stop_gradient = pair_act.stop_gradient # TODO(GuoxiaWang): fix PyLayer ctx illegal access msa_act = paddle.assign(msa_act) pair_act = paddle.assign(pair_act) msa_act, pair_act = bp.sync_evoformer_results(outer_product_mean, msa_act, pair_act) # TODO(GuoxiaWang): fix PyLayer ctx illegal access pair_act = paddle.assign(pair_act) return msa_act, pair_act else: # [B, N_seq//dap_size, N_res, c_m] residual = self.msa_row_attention_with_pair_bias(msa_act, msa_mask, pair_act) residual = 
self.msa_row_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m] msa_act = dap.row_to_col(msa_act) if self.is_extra_msa: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_global_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual else: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual # [B, N_res//dap_size, N_res, c_z] residual = self.outer_product_mean(msa_act, msa_mask) outer_product_mean = self.outer_product_mean_dropout(residual) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_act = dap.col_to_row(msa_act) # scatter if using dap, otherwise do nothing pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) # [B, N_res//dap_size, N_res, c_z] # TODO(GuoxiaWang): why have diffrence whether remove pair_act = pair_act.clone() # pair_act = pair_act.clone() residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) pair_act = pair_act + outer_product_mean return msa_act, pair_act def forward(self, msa_act, pair_act, masks): """tbd.""" if self.global_config.outer_product_mean_position in [ 'origin', 'middle' ]: msa_act, pair_act = self.outer_product_mean_origin(msa_act, pair_act, masks) elif self.global_config.outer_product_mean_position == 'first': msa_act, pair_act = self.outer_product_mean_first(msa_act, pair_act, masks) elif self.global_config.outer_product_mean_position == 'end': msa_act, pair_act = self.outer_product_mean_end(msa_act, pair_act, masks) else: raise Error( "Only support outer_product_mean_position in ['origin', 'middle', ''first', 'end'] now!" ) return msa_act, pair_act class DistEmbeddingsAndEvoformer(nn.Layer): """Embeds the input data and runs Evoformer. Produces the MSA, single and pair representations. Jumper et al. (2021) Suppl. 
Alg. 2 "Inference" line 5-18 """ def __init__(self, channel_num, config, global_config): super(DistEmbeddingsAndEvoformer, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear # InputEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5 # Jumper et al. (2021) Suppl. Alg. 3 "InputEmbedder" self.preprocess_1d = Linear( channel_num['target_feat'], self.config.msa_channel, name='preprocess_1d') self.preprocess_msa = Linear( channel_num['msa_feat'], self.config.msa_channel, name='preprocess_msa') self.left_single = Linear( channel_num['target_feat'], self.config.pair_channel, name='left_single') self.right_single = Linear( channel_num['target_feat'], self.config.pair_channel, name='right_single') # RecyclingEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 6 # Jumper et al. (2021) Suppl. Alg. 32 "RecyclingEmbedder" if self.config.recycle_pos: self.prev_pos_linear = Linear(self.config.prev_pos.num_bins, self.config.pair_channel) # RelPosEmbedder # Jumper et al. (2021) Suppl. Alg. 4 "relpos" # Jumper et al. (2021) Suppl. Alg. 5 "one_hot" if self.config.max_relative_feature: self.pair_activiations = Linear( 2 * self.config.max_relative_feature + 1, self.config.pair_channel) if self.config.recycle_features: self.prev_msa_first_row_norm = nn.LayerNorm( self.config.msa_channel) self.prev_pair_norm = nn.LayerNorm(self.config.pair_channel) # Embed templates into the pair activations. # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-13 if self.config.template.enabled: self.channel_num['template_angle'] = 57 self.channel_num['template_pair'] = 88 self.template_embedding = TemplateEmbedding( self.channel_num, self.config.template, self.global_config) # ExtraMSAEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 14-16 self.extra_msa_activations = Linear( 25, # 23 (20aa+unknown+gap+mask) + 1 (has_del) + 1 (del_val) self.config.extra_msa_channel) # Extra MSA Stack. # Jumper et al. (2021) Suppl. Alg. 18 "ExtraMsaStack" self.extra_msa_stack = nn.LayerList() for _ in range(self.config.extra_msa_stack_num_block): self.extra_msa_stack.append( EvoformerIteration( self.channel_num, self.config.evoformer, self.global_config, is_extra_msa=True)) # Embed templates torsion angles if self.config.template.enabled and self.config.template.embed_torsion_angles: c = self.config.msa_channel self.template_single_embedding = Linear( self.channel_num['template_angle'], c) self.template_projection = Linear(c, c) # Main trunk of the network # Jumper et al. (2021) Suppl. Alg. 
2 "Inference" lines 17-18 self.evoformer_iteration = nn.LayerList() for _ in range(self.config.evoformer_num_block): self.evoformer_iteration.append( EvoformerIteration( self.channel_num, self.config.evoformer, self.global_config, is_extra_msa=False)) self.single_activations = Linear(self.config.msa_channel, self.config.seq_channel) def _pseudo_beta_fn(self, aatype, all_atom_positions, all_atom_masks): """tbd.""" gly_id = paddle.ones_like(aatype) * residue_constants.restype_order[ 'G'] is_gly = paddle.equal(aatype, gly_id) ca_idx = residue_constants.atom_order['CA'] cb_idx = residue_constants.atom_order['CB'] n = len(all_atom_positions.shape) pseudo_beta = paddle.where( paddle.tile( paddle.unsqueeze( is_gly, axis=-1), [1] * len(is_gly.shape) + [3]), paddle.squeeze( all_atom_positions.slice([n - 2], [ca_idx], [ca_idx + 1]), axis=-2), paddle.squeeze( all_atom_positions.slice([n - 2], [cb_idx], [cb_idx + 1]), axis=-2)) if all_atom_masks is not None: m = len(all_atom_masks) pseudo_beta_mask = paddle.where( is_gly, paddle.squeeze( all_atom_masks.slice([m - 1], [ca_idx], [ca_idx + 1]), axis=-1), paddle.squeeze( all_atom_masks.slice([m - 1], [cb_idx], [cb_idx + 1]), axis=-1)) pseudo_beta_mask = paddle.squeeze(pseudo_beta_mask, axis=-1) return pseudo_beta, pseudo_beta_mask else: return pseudo_beta def _create_extra_msa_feature(self, batch): """tbd.""" # 23: 20aa + unknown + gap + bert mask msa_1hot = nn.functional.one_hot(batch['extra_msa'], 23) msa_feat = [ msa_1hot, paddle.unsqueeze( batch['extra_has_deletion'], axis=-1), paddle.unsqueeze( batch['extra_deletion_value'], axis=-1) ] return paddle.concat(msa_feat, axis=-1) def forward(self, batch): """tbd.""" # InputEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5 # Jumper et al. (2021) Suppl. Alg. 3 "InputEmbedder" preprocess_1d = self.preprocess_1d(batch['target_feat']) # preprocess_msa = self.preprocess_msa(batch['msa_feat']) msa_activations = paddle.unsqueeze(preprocess_1d, axis=1) + \ self.preprocess_msa(batch['msa_feat']) right_single = self.right_single( batch['target_feat']) # 1, n_res, 22 -> 1, n_res, 128 right_single = paddle.unsqueeze( right_single, axis=1) # 1, n_res, 128 -> 1, 1, n_res, 128 left_single = self.left_single( batch['target_feat']) # 1, n_res, 22 -> 1, n_res, 128 left_single = paddle.unsqueeze( left_single, axis=2) # 1, n_res, 128 -> 1, n_res, 1, 128 pair_activations = left_single + right_single if not self.training and self.global_config.low_memory is True: del left_single del right_single gc.collect() # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z] pair_activations = dap.scatter(pair_activations, axis=1) mask_2d = paddle.unsqueeze( batch['seq_mask'], axis=1) * paddle.unsqueeze( batch['seq_mask'], axis=2) # Inject previous outputs for recycling. # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 6 # Jumper et al. (2021) Suppl. Alg. 
32 "RecyclingEmbedder" if self.config.recycle_pos and 'prev_pos' in batch: prev_pseudo_beta = self._pseudo_beta_fn(batch['aatype'], batch['prev_pos'], None) dgram = dgram_from_positions(prev_pseudo_beta, **self.config.prev_pos) if not self.training and self.global_config.low_memory is True: dgram = dap.scatter(dgram, axis=1) pair_activations += self.prev_pos_linear(dgram) del dgram del prev_pseudo_beta gc.collect() else: pair_activations += self.prev_pos_linear(dgram) if self.config.recycle_features: if 'prev_msa_first_row' in batch: prev_msa_first_row = self.prev_msa_first_row_norm(batch[ 'prev_msa_first_row']) # A workaround for `jax.ops.index_add` msa_first_row = paddle.squeeze( msa_activations[:, 0, :], axis=1) msa_first_row += prev_msa_first_row msa_first_row = paddle.unsqueeze(msa_first_row, axis=1) msa_activations = paddle.concat( [msa_first_row, msa_activations[:, 1:, :]], axis=1) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: del prev_msa_first_row del msa_first_row gc.collect() if 'prev_pair' in batch: # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: prev_pair = batch['prev_pair'] prev_pair_gpu = prev_pair.cuda() prev_pair_gpu = dap.scatter(prev_pair_gpu, axis=1) pair_activations += self.prev_pair_norm(prev_pair_gpu) del prev_pair_gpu gc.collect() else: pair_activations += self.prev_pair_norm(batch['prev_pair']) # RelPosEmbedder # Jumper et al. (2021) Suppl. Alg. 4 "relpos" # Jumper et al. (2021) Suppl. Alg. 5 "one_hot" if self.config.max_relative_feature: pos = batch['residue_index'] # [bs, N_res] offset = paddle.unsqueeze(pos, axis=[-1]) - \ paddle.unsqueeze(pos, axis=[-2]) rel_pos = nn.functional.one_hot( paddle.clip( offset + self.config.max_relative_feature, min=0, max=2 * self.config.max_relative_feature), 2 * self.config.max_relative_feature + 1) if not self.training and self.global_config.low_memory is True: rel_pos = dap.scatter(rel_pos, axis=1) rel_pos_bias = self.pair_activiations(rel_pos) pair_activations += rel_pos_bias del rel_pos del rel_pos_bias gc.collect() else: rel_pos_bias = self.pair_activiations(rel_pos) pair_activations += rel_pos_bias # TemplateEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-13 if self.config.template.enabled: template_batch = { k: batch[k] for k in batch if k.startswith('template_') } template_pair_repr = self.template_embedding( pair_activations, template_batch, mask_2d) pair_activations += template_pair_repr # ExtraMSAEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 14-16 extra_msa_feat = self._create_extra_msa_feature(batch) extra_msa_activations = self.extra_msa_activations(extra_msa_feat) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: del extra_msa_feat gc.collect() # ================================================== # Extra MSA Stack # Jumper et al. (2021) Suppl. Alg. 
18 "ExtraMsaStack" # ================================================== if not self.training and self.global_config.low_memory is True: # scatter if using dap, otherwise do nothing # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] extra_msa_activations = dap.scatter(extra_msa_activations, axis=1) # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_activations = dap.scatter(msa_activations, axis=1) extra_msa_stack_input = { 'msa': extra_msa_activations, 'pair': pair_activations, } if not self.training and self.global_config.low_memory is True: del pair_activations gc.collect() if bp.get_world_size() > 1: extra_msa_stack_input['msa'] = bp.broadcast_grad_for_backward( extra_msa_stack_input['msa'], 0) if not self.training and self.global_config.low_memory is True: pass else: # scatter if using dap, otherwise do nothing # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] extra_msa_stack_input['msa'] = dap.scatter( extra_msa_stack_input['msa'], axis=1) # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z] extra_msa_stack_input['pair'] = dap.scatter( extra_msa_stack_input['pair'], axis=1) for idx, extra_msa_stack_iteration in enumerate(self.extra_msa_stack): extra_msa_act, extra_pair_act = recompute_wrapper( extra_msa_stack_iteration, extra_msa_stack_input['msa'], extra_msa_stack_input['pair'], {'msa': batch['extra_msa_mask'], 'pair': mask_2d}, is_recompute=self.training and idx >= self.config.extra_msa_stack_recompute_start_block_index) extra_msa_stack_output = { 'msa': extra_msa_act, 'pair': extra_pair_act } extra_msa_stack_input = { 'msa': extra_msa_stack_output['msa'], 'pair': extra_msa_stack_output['pair'] } if not self.training and self.global_config.low_memory is True: pass else: # gather if using dap, otherwise do nothing # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res, c_z] extra_msa_stack_output['pair'] = dap.gather( extra_msa_stack_output['pair'], axis=1) evoformer_input = { 'msa': msa_activations, 'pair': extra_msa_stack_output['pair'], } evoformer_masks = { 'msa': batch['msa_mask'], 'pair': mask_2d, } if not self.training and self.global_config.low_memory is True: del extra_msa_stack_input del extra_msa_stack_output gc.collect() # ================================================== # Template angle feat # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 7-8 # ================================================== if self.config.template.enabled and self.config.template.embed_torsion_angles: num_templ, num_res = batch['template_aatype'].shape[1:] aatype_one_hot = nn.functional.one_hot(batch['template_aatype'], 22) # Embed the templates aatype, torsion angles and masks. # Shape (templates, residues, msa_channels) ret = all_atom.atom37_to_torsion_angles( aatype=batch['template_aatype'], all_atom_pos=batch['template_all_atom_positions'], all_atom_mask=batch['template_all_atom_masks'], # Ensure consistent behaviour during testing: placeholder_for_undefined=not self.global_config.zero_init) template_features = paddle.concat( [ aatype_one_hot, paddle.reshape(ret['torsion_angles_sin_cos'], [-1, num_templ, num_res, 14]), paddle.reshape(ret['alt_torsion_angles_sin_cos'], [-1, num_templ, num_res, 14]), ret['torsion_angles_mask'] ], axis=-1) template_activations = self.template_single_embedding( template_features) template_activations = nn.functional.relu(template_activations) template_activations = self.template_projection( template_activations) # Concatenate the templates to the msa. 
evoformer_input['msa'] = paddle.concat( [evoformer_input['msa'], template_activations], axis=1) # Concatenate templates masks to the msa masks. # Use mask from the psi angle, as it only depends on the backbone atoms # from a single residue. torsion_angle_mask = ret['torsion_angles_mask'][..., 2] torsion_angle_mask = torsion_angle_mask.astype(evoformer_masks[ 'msa'].dtype) evoformer_masks['msa'] = paddle.concat( [evoformer_masks['msa'], torsion_angle_mask], axis=1) if bp.get_world_size() > 1: evoformer_input['msa'] = bp.broadcast_grad_for_backward( evoformer_input['msa'], 0) # if self.training: if not self.training and self.global_config.low_memory is True: pass else: # scatter if using dap, otherwise do nothing # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] evoformer_input['msa'] = dap.scatter( evoformer_input['msa'], axis=1) # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z] evoformer_input['pair'] = dap.scatter( evoformer_input['pair'], axis=1) # ================================================== # Main MSA Stack # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 17-18 # ================================================== for idx, evoformer_block in enumerate(self.evoformer_iteration): msa_act, pair_act = recompute_wrapper( evoformer_block, evoformer_input['msa'], evoformer_input['pair'], evoformer_masks, is_recompute=self.training and idx >= self.config.evoformer_recompute_start_block_index) evoformer_output = {'msa': msa_act, 'pair': pair_act} evoformer_input = { 'msa': evoformer_output['msa'], 'pair': evoformer_output['pair'], } # gather if using dap, otherwise do nothing # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res, c_m] evoformer_output['msa'] = dap.gather(evoformer_output['msa'], axis=1) # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res, c_z] evoformer_output['pair'] = dap.gather(evoformer_output['pair'], axis=1) msa_activations = evoformer_output['msa'] pair_activations = evoformer_output['pair'] if not self.training and self.global_config.low_memory is True: pair_activations_cpu = pair_activations.cpu() del pair_activations single_activations = self.single_activations(msa_activations[:, 0]) # if not self.training and self.global_config.low_memory is True: # pair_act_out = pair_activations_cpu # else: # pair_act_out = pair_activations num_seq = batch['msa_feat'].shape[1] output = { 'single': single_activations, 'pair': pair_activations_cpu if not self.training and self.global_config.low_memory is True else pair_activations, # Crop away template rows such that they are not used # in MaskedMsaHead. 'msa': msa_activations[:, :num_seq], 'msa_first_row': msa_activations[:, 0], } return output ================================================ FILE: ppfleetx/models/protein_folding/outer_product_mean.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
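# ---------------------------------------------------------------------------
# Illustrative note (not part of the original file): ignoring the DAP
# sharding and the chunked/fused matmul paths used below, OuterProductMean
# reduces to the reference computation of Suppl. Alg. 10 -- a masked outer
# product of two per-residue projections, averaged over the sequence axis.
# A readability sketch (the einsum spelling here is an assumption for
# exposition, not the code path this module actually executes):
#
#     left  = mask * left_projection(layer_norm_input(act))   # [B, S, I, C]
#     right = mask * right_projection(layer_norm_input(act))  # [B, S, J, C]
#     outer = paddle.einsum('nsic,nsjd->nijcd', left, right)
#     norm  = paddle.einsum('nsix,nsjx->nijx', mask, mask) + 1e-3
#     pair_update = (paddle.einsum('nijcd,cdz->nijz', outer, output_w)
#                    + output_b) / norm
# ---------------------------------------------------------------------------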
import paddle import paddle.nn as nn from ppfleetx.distributed.protein_folding import dap from .common import subbatch class OuterProductMean(nn.Layer): """Computes mean outer product. Jumper et al. (2021) Suppl. Alg. 10 "OuterProductMean" """ def __init__(self, channel_num, config, global_config, is_extra_msa, name='outer_product_mean'): super(OuterProductMean, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear if is_extra_msa: c_m = channel_num['extra_msa_channel'] else: c_m = channel_num['msa_channel'] self.layer_norm_input = nn.LayerNorm(c_m, name='layer_norm_input') self.left_projection = Linear( c_m, self.config.num_outer_channel, name='left_projection') self.right_projection = Linear( c_m, self.config.num_outer_channel, name='right_projection') if self.global_config.zero_init: init_w = nn.initializer.Constant(value=0.0) else: init_w = nn.initializer.KaimingNormal() self.output_w = paddle.create_parameter( [ self.config.num_outer_channel, self.config.num_outer_channel, channel_num['pair_channel'] ], 'float32', default_initializer=init_w) self.output_b = paddle.create_parameter( [channel_num['pair_channel']], 'float32', default_initializer=nn.initializer.Constant(value=0.0)) def forward(self, act, mask): """Builds OuterProductMean module. Arguments: act: MSA representation, shape [batch, N_seq, N_res, c_m]. mask: MSA mask, shape [batch, N_seq, N_res]. Returns: Update to pair representation, shape [batch, N_res, N_res, c_z]. """ # [B, N_seq, N_res//dap_size, c_m] act = self.layer_norm_input(act) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq, N_res//dap_size, num_outer_channel] right_act_before = self.right_projection(act) # [B, N_seq, N_res//dap_size, num_outer_channel] => [B, N_seq, N_res, num_outer_channel] right_act = dap.all_gather(right_act_before, axis=2) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq, N_res//dap_size, num_outer_channel] left_act = self.left_projection(act) # [B, N_seq, N_res] => [B, N_seq, N_res, 1] mask = paddle.unsqueeze(mask, axis=-1) # [B, N_seq, N_res, 1] => [B, N_seq, N_res//dap_size, 1] mask_col = dap.scatter(mask, axis=2) left_act = mask_col * left_act # [B, N_seq, N_res//dap_size, 1], [B, N_seq, N_res, 1] => [B, N_res//dap_size, N_res, 1] epsilon = 1e-3 norm = paddle.einsum('nabc,nadc->nbdc', mask_col, mask) + epsilon def fast_einsum(equation, left_act, right_act): assert equation == "nacb,nade->ndceb" tmp = paddle.matmul( x=paddle.reshape( right_act, [right_act.shape[0], right_act.shape[1], -1]), # na(de) y=paddle.reshape( left_act, [left_act.shape[0], left_act.shape[1], -1]), # na(cb) transpose_x=True, transpose_y=False) # n(de)(cb) tmp = paddle.reshape(tmp, [ left_act.shape[0], right_act.shape[2], right_act.shape[3], left_act.shape[2], left_act.shape[3] ]) out = paddle.transpose(tmp, perm=[0, 1, 3, 2, 4]) return out def compute_chunk(left_act, right_act): # This is equivalent to # # act = jnp.einsum('abc,ade->dceb', left_act, right_act) # act = jnp.einsum('dceb,cef->bdf', act, output_w) + output_b # # but faster. maybe for subbatch inference? 
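            # Note (added for clarity, not in the original source): the
            # fast_einsum helper defined above realises 'nacb,nade->ndceb' as
            # one batched matmul that contracts the shared sequence axis 'a'
            # (by flattening the trailing dims of each operand), followed by a
            # reshape and transpose back to the einsum's output layout; on GPU
            # this is usually faster than a general einsum.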
# [B, N_seq, N_res//dap_size, num_outer_channel] => [B, N_seq, num_outer_channel, N_res//dap_size] left_act = left_act.transpose([0, 1, 3, 2]) # wait if using async communication and dap, otherwise do nothing right_act_after = dap.all_gather_opp(right_act, axis=2) # [B, N_seq, num_outer_channel, N_res//dap_size], [B, N_seq, N_res, num_outer_channel] # => [B, N_res, num_outer_channel, num_outer_channel, N_res//dap_size] act = fast_einsum('nacb,nade->ndceb', left_act, right_act_after) # [B, N_res, num_outer_channel, num_outer_channel, N_res//dap_size], [num_outer_channel, num_outer_channel, c_z] # => [B, N_res, N_res//dap_size, c_z] act = paddle.einsum('ndceb,cef->ndbf', act, self.output_w) + self.output_b # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] return act.transpose([0, 2, 1, 3]) if not self.training: # low memory mode using subbatch sb_chunk = subbatch(compute_chunk, [0], [2], self.config.chunk_size, 1) act = sb_chunk(left_act, right_act) else: act = compute_chunk(left_act, right_act) act = act / norm return act ================================================ FILE: ppfleetx/models/protein_folding/quat_affine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Quaternion geometry modules. This introduces a representation of coordinate frames that is based around a ‘QuatAffine’ object. This object describes an array of coordinate frames. It consists of vectors corresponding to the origin of the frames as well as orientations which are stored in two ways, as unit quaternions as well as a rotation matrices. The rotation matrices are derived from the unit quaternions and the two are kept in sync. For an explanation of the relation between unit quaternions and rotations see https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation This representation is used in the model for the backbone frames. One important thing to note here, is that while we update both representations the jit compiler is going to ensure that only the parts that are actually used are executed. 
""" import paddle import functools import numpy as np from typing import Tuple QUAT_TO_ROT = np.zeros((4, 4, 3, 3), dtype=np.float32) QUAT_TO_ROT[0, 0] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] # rr QUAT_TO_ROT[1, 1] = [[1, 0, 0], [0, -1, 0], [0, 0, -1]] # ii QUAT_TO_ROT[2, 2] = [[-1, 0, 0], [0, 1, 0], [0, 0, -1]] # jj QUAT_TO_ROT[3, 3] = [[-1, 0, 0], [0, -1, 0], [0, 0, 1]] # kk QUAT_TO_ROT[1, 2] = [[0, 2, 0], [2, 0, 0], [0, 0, 0]] # ij QUAT_TO_ROT[1, 3] = [[0, 0, 2], [0, 0, 0], [2, 0, 0]] # ik QUAT_TO_ROT[2, 3] = [[0, 0, 0], [0, 0, 2], [0, 2, 0]] # jk QUAT_TO_ROT[0, 1] = [[0, 0, 0], [0, 0, -2], [0, 2, 0]] # ir QUAT_TO_ROT[0, 2] = [[0, 0, 2], [0, 0, 0], [-2, 0, 0]] # jr QUAT_TO_ROT[0, 3] = [[0, -2, 0], [2, 0, 0], [0, 0, 0]] # kr QUAT_MULTIPLY = np.zeros((4, 4, 4), dtype=np.float32) QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]] QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]] QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]] QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]] QUAT_MULTIPLY_BY_VEC = QUAT_MULTIPLY[:, 1:, :] def rot_to_quat(rot): """Convert rotation matrix to quaternion. Note that this function calls self_adjoint_eig which is extremely expensive on the GPU. If at all possible, this function should run on the CPU. Args: rot: rotation matrix (see below for format). rotation matrix should be shape (..., 3, 3) Returns: Quaternion as (..., 4) tensor. """ rot = [[rot[..., i, j] for j in range(3)] for i in range(3)] [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = rot # pylint: disable=bad-whitespace k = [[ xx + yy + zz, zy - yz, xz - zx, yx - xy, ], [ zy - yz, xx - yy - zz, xy + yx, xz + zx, ], [ xz - zx, xy + yx, yy - xx - zz, yz + zy, ], [ yx - xy, xz + zx, yz + zy, zz - xx - yy, ]] k = (1. / 3.) * paddle.stack( [paddle.stack( x, axis=-1) for x in k], axis=-2) # Get eigenvalues in non-decreasing order and associated. _, qs = paddle.linalg.eigh(k) return qs[..., -1] def quat_to_rot(normalized_quat): """Convert a normalized quaternion to a rotation matrix. Quat (..., 4)""" mat = paddle.unsqueeze(normalized_quat, [-1, -3]) # normalized_quat[..., None, :, None] rot_tensor = paddle.sum( paddle.to_tensor(np.reshape(QUAT_TO_ROT, (4, 4, 9))) * normalized_quat[..., :, None, None] * mat, axis=(-3, -2)) # (..., 4, 4, 9) -> (..., 9) t_shape = rot_tensor.shape[:-1] t_shape.extend([3, 3]) rot = paddle.reshape(rot_tensor, t_shape) # Unstack. (..., 3, 3) return rot def quat_multiply_by_vec(quat, vec): """Multiply a quaternion by a pure-vector quaternion.""" mat = paddle.unsqueeze(vec, [-1, -3]) # vec[..., None, :, None] return paddle.sum(paddle.to_tensor(QUAT_MULTIPLY_BY_VEC) * quat[..., :, None, None] * mat, axis=(-3, -2)) def quat_multiply(quat1, quat2): """Multiply a quaternion by another quaternion.""" mat = paddle.unsqueeze(quat2, [-1, -3]) # quat2[..., None, :, None] return paddle.sum(paddle.to_tensor(QUAT_MULTIPLY) * quat1[..., :, None, None] * mat, axis=(-3, -2)) def apply_rot_to_vec(rot, vec, unstack=False): """Multiply rotation matrix by a vector. vec is a list. Returns: a list of 3 tensors of the points """ if unstack: x, y, z = [vec[..., i] for i in range(3)] else: x, y, z = vec return [ rot[..., 0, 0] * x + rot[..., 0, 1] * y + rot[..., 0, 2] * z, rot[..., 1, 0] * x + rot[..., 1, 1] * y + rot[..., 1, 2] * z, rot[..., 2, 0] * x + rot[..., 2, 1] * y + rot[..., 2, 2] * z ] def apply_rot_to_vec_np(rot, vec, unstack=False): """Multiply rotation matrix by a vector. 
vec is a list. Returns: a list of 3 tensors of the points """ if unstack: x, y, z = [vec[..., i] for i in range(3)] else: x, y, z = vec return [ rot[0][0] * x + rot[0][1] * y + rot[0][2] * z, rot[1][0] * x + rot[1][1] * y + rot[1][2] * z, rot[2][0] * x + rot[2][1] * y + rot[2][2] * z ] def apply_inverse_rot_to_vec(rot, vec): """Multiply the inverse of a rotation matrix by a vector. vec is a list. Returns: a list of 3 tensors of the points """ # Inverse rotation is just transpose x, y, z = vec return [ rot[..., 0, 0] * x + rot[..., 1, 0] * y + rot[..., 2, 0] * z, rot[..., 0, 1] * x + rot[..., 1, 1] * y + rot[..., 2, 1] * z, rot[..., 0, 2] * x + rot[..., 1, 2] * y + rot[..., 2, 2] * z ] class QuatAffine(object): """Affine transformation represented by quaternion and vector.""" def __init__(self, quaternion: paddle.Tensor, translation: paddle.Tensor, rotation=None, normalize=True): """Initialize from quaternion and translation. Args: quaternion: Rotation represented by a quaternion, to be applied before translation. Must be a unit quaternion unless normalize==True. shape (batch, N_res, 4) translation: Translation represented as a vector. (batch, N_res, 3) rotation: Same rotation as the quaternion, represented as a (batch, N_res, 3, 3) tensor. If None, rotation will be calculated from the quaternion. normalize: If True, l2 normalize the quaternion on input. """ if quaternion is not None: assert quaternion.shape[-1] == 4 if normalize and quaternion is not None: q_length = paddle.norm(quaternion, axis=-1) quaternion = quaternion / q_length[..., None] if rotation is None: rotation = quat_to_rot(quaternion) self.quaternion = quaternion self.rotation = rotation self.translation = translation assert rotation.shape[-1] == 3 and rotation.shape[-2] == 3 assert translation.shape[-1] == 3 def to_tensor(self): return paddle.concat([self.quaternion, self.translation], axis=-1) def stop_rot_gradient(self): """ stop the gradient of rotations """ quat = self.quaternion if not quat is None: quat = quat.detach() return QuatAffine( quaternion=quat, translation=self.translation, rotation=self.rotation.detach(), normalize=False) def scale_translation(self, position_scale): """Return a new quat affine with a different scale for translation.""" return QuatAffine( self.quaternion, position_scale * self.translation, rotation=self.rotation, normalize=False) @classmethod def from_tensor(cls, tensor, normalize=False): assert tensor.shape[-1] == 7 quaternion = tensor[..., 0:4] translation = tensor[..., 4:7] return cls(quaternion, translation, normalize=normalize) def pre_compose(self, update): """Return a new QuatAffine which applies the transformation update first. Args: update: Length-6 vector. 3-vector of x, y, and z such that the quaternion update is (1, x, y, z) and zero for the 3-vector is the identity quaternion. 3-vector for translation concatenated. Returns: New QuatAffine object. """ vector_quaternion_update = update[..., 0:3] trans_update = [update[..., 3], update[..., 4], update[..., 5]] new_quaternion = (self.quaternion + quat_multiply_by_vec( self.quaternion, vector_quaternion_update)) trans_update = apply_rot_to_vec(self.rotation, trans_update) trans_update = paddle.stack(trans_update, axis=-1) new_translation = self.translation + trans_update return QuatAffine(new_quaternion, new_translation) def apply_to_point(self, point, extra_dims=0): """Apply affine to a point. Args: point: List of 3 tensors to apply affine. 
each with shape [batch_size, num_residues, num_head*num_point_qk] extra_dims: Number of dimensions at the end of the transformed_point shape that are not present in the rotation and translation. The most common use is rotation N points at once with extra_dims=1 for use in a network. Returns: Transformed point after applying affine. """ rotation = self.rotation # [batch_size, num_residues, 3, 3] translation = self.translation # [batch_size, num_residues, 3] for _ in range(extra_dims): translation = paddle.unsqueeze(translation, axis=-2) rotation = paddle.unsqueeze(rotation, axis=-3) rot_point = apply_rot_to_vec(rotation, point) return [ rot_point[0] + translation[..., 0], rot_point[1] + translation[..., 1], rot_point[2] + translation[..., 2] ] def invert_point(self, transformed_point, extra_dims=0): """Apply inverse of transformation to a point. Args: transformed_point: List of 3 tensors to apply affine extra_dims: Number of dimensions at the end of the transformed_point shape that are not present in the rotation and translation. The most common use is rotation N points at once with extra_dims=1 for use in a network. Returns: Transformed point after applying affine. """ rotation = self.rotation translation = self.translation for _ in range(extra_dims): translation = paddle.unsqueeze(translation, axis=-2) rotation = paddle.unsqueeze(rotation, axis=-3) rot_point = [ transformed_point[0] - translation[..., 0], transformed_point[1] - translation[..., 1], transformed_point[2] - translation[..., 2] ] return apply_inverse_rot_to_vec(rotation, rot_point) def invert(self): """Return a new quat affine of the invert transformation.""" pass # TODO ######Paddle Implementation def _multiply(a, b): a1 = a[..., 0, 0] a2 = a[..., 0, 1] a3 = a[..., 0, 2] a11 = a[..., 1, 0] a12 = a[..., 1, 1] a13 = a[..., 1, 2] a21 = a[..., 2, 0] a22 = a[..., 2, 1] a23 = a[..., 2, 2] b1 = b[..., 0, 0] b2 = b[..., 1, 0] b3 = b[..., 0, 1] b11 = b[..., 1, 1] b12 = b[..., 2, 0] b13 = b[..., 0, 2] b21 = b[..., 1, 2] b22 = b[..., 2, 1] b23 = b[..., 2, 2] return paddle.stack( [ paddle.stack( [ a1 * b1 + a2 * b2 + a3 * b12, a1 * b3 + a2 * b11 + a3 * b22, a1 * b13 + a2 * b21 + a3 * b23 ], axis=-1), paddle.stack( [ a11 * b1 + a12 * b2 + a13 * b12, a11 * b3 + a12 * b11 + a13 * b22, a11 * b13 + a12 * b21 + a13 * b23 ], axis=-1), paddle.stack( [ a21 * b1 + a22 * b2 + a23 * b12, a21 * b3 + a22 * b11 + a23 * b22, a21 * b13 + a22 * b21 + a23 * b23 ], axis=-1) ], axis=-2) def make_canonical_transform( n_xyz: paddle.Tensor, ca_xyz: paddle.Tensor, c_xyz: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Returns translation and rotation matrices to canonicalize residue atoms. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, n_res, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, n_res, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, n_res, 3] of carbon xyz coordinates. Returns: A tuple (translation, rotation) where: translation is an array of shape [batch, n_res, 3] defining the translation. rotation is an array of shape [batch, n_res, 3, 3] defining the rotation. 
After applying the translation and rotation to all atoms in a residue: * All atoms will be shifted so that CA is at the origin, * All atoms will be rotated so that C is at the x-axis, * All atoms will be shifted so that N is in the xy plane. """ assert len(n_xyz.shape) == 3, n_xyz.shape assert n_xyz.shape[-1] == 3, n_xyz.shape assert n_xyz.shape == ca_xyz.shape == c_xyz.shape, ( n_xyz.shape, ca_xyz.shape, c_xyz.shape) # Place CA at the origin. translation = -ca_xyz n_xyz = n_xyz + translation c_xyz = c_xyz + translation # Place C on the x-axis. c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)] # Rotate by angle c1 in the x-y plane (around the z-axis). norm = paddle.sqrt(c_x**2 + c_y**2 + 1e-20) sin_c1 = -c_y / norm cos_c1 = c_x / norm zeros = paddle.zeros_like(sin_c1) ones = paddle.ones_like(sin_c1) c1_rot_matrix = paddle.stack( [cos_c1, -sin_c1, zeros, sin_c1, cos_c1, zeros, zeros, zeros, ones], axis=-1) c1_rot_matrix = c1_rot_matrix.reshape(sin_c1.shape + [3, 3]) # Rotate by angle c2 in the x-z plane (around the y-axis). # norm = paddle.sqrt(1e-20 + c_x ** 2 + c_y ** 2 + c_z ** 2) norm = paddle.sqrt(paddle.sum(c_xyz**2, axis=-1)) + 1e-20 sin_c2 = c_z / norm cos_c2 = paddle.sqrt(c_x**2 + c_y**2) / norm c2_rot_matrix = paddle.stack( [cos_c2, zeros, sin_c2, zeros, ones, zeros, -sin_c2, zeros, cos_c2], axis=-1) c2_rot_matrix = c2_rot_matrix.reshape(sin_c2.shape + [3, 3]) c_rot_matrix = _multiply(c2_rot_matrix, c1_rot_matrix) n_xyz = paddle.stack( apply_rot_to_vec( c_rot_matrix, n_xyz, unstack=True), axis=-1) # Place N in the x-y plane. _, n_y, n_z = [n_xyz[..., i] for i in range(3)] # Rotate by angle alpha in the y-z plane (around the x-axis). norm = paddle.sqrt(n_y**2 + n_z**2 + 1e-20) sin_n = -n_z / norm cos_n = n_y / norm n_rot_matrix = paddle.stack( [ones, zeros, zeros, zeros, cos_n, -sin_n, zeros, sin_n, cos_n], axis=-1) n_rot_matrix = n_rot_matrix.reshape(sin_n.shape + [3, 3]) # pylint: enable=bad-whitespace return (translation, _multiply(n_rot_matrix, c_rot_matrix)) def make_transform_from_reference( n_xyz: paddle.Tensor, ca_xyz: paddle.Tensor, c_xyz: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Returns rotation and translation matrices to convert from reference. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, n_res, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, n_res, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, n_res, 3] of carbon xyz coordinates. Returns: A tuple (rotation, translation) where: rotation is an array of shape [batch, n_res, 3, 3] defining the rotation. translation is an array of shape [batch, n_res, 3] defining the translation. After applying the translation and rotation to the reference backbone, the coordinates will approximately equal to the input coordinates. The order of translation and rotation differs from make_canonical_transform because the rotation from this function should be applied before the translation, unlike make_canonical_transform. 
""" translation, rotation = make_canonical_transform(n_xyz, ca_xyz, c_xyz) return paddle.transpose(rotation, (0, 1, 3, 2)), -translation #######Numpy Implementation def _multiply_np(a, b): return np.stack([ np.array([ a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0], a[0][0] * b[0][1] + a[0][1] * b[1][1] + a[0][2] * b[2][1], a[0][0] * b[0][2] + a[0][1] * b[1][2] + a[0][2] * b[2][2] ]), np.array([ a[1][0] * b[0][0] + a[1][1] * b[1][0] + a[1][2] * b[2][0], a[1][0] * b[0][1] + a[1][1] * b[1][1] + a[1][2] * b[2][1], a[1][0] * b[0][2] + a[1][1] * b[1][2] + a[1][2] * b[2][2] ]), np.array([ a[2][0] * b[0][0] + a[2][1] * b[1][0] + a[2][2] * b[2][0], a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1], a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2] ]) ]) def make_canonical_transform_np( n_xyz: np.ndarray, ca_xyz: np.ndarray, c_xyz: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Returns translation and rotation matrices to canonicalize residue atoms. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, 3] of carbon xyz coordinates. Returns: A tuple (translation, rotation) where: translation is an array of shape [batch, 3] defining the translation. rotation is an array of shape [batch, 3, 3] defining the rotation. After applying the translation and rotation to all atoms in a residue: * All atoms will be shifted so that CA is at the origin, * All atoms will be rotated so that C is at the x-axis, * All atoms will be shifted so that N is in the xy plane. """ assert len(n_xyz.shape) == 2, n_xyz.shape assert n_xyz.shape[-1] == 3, n_xyz.shape assert n_xyz.shape == ca_xyz.shape == c_xyz.shape, ( n_xyz.shape, ca_xyz.shape, c_xyz.shape) # Place CA at the origin. translation = -ca_xyz n_xyz = n_xyz + translation c_xyz = c_xyz + translation # Place C on the x-axis. c_x, c_y, c_z = [c_xyz[:, i] for i in range(3)] # Rotate by angle c1 in the x-y plane (around the z-axis). sin_c1 = -c_y / np.sqrt(1e-20 + c_x**2 + c_y**2) cos_c1 = c_x / np.sqrt(1e-20 + c_x**2 + c_y**2) zeros = np.zeros_like(sin_c1) ones = np.ones_like(sin_c1) # pylint: disable=bad-whitespace c1_rot_matrix = np.stack([ np.array([cos_c1, -sin_c1, zeros]), np.array([sin_c1, cos_c1, zeros]), np.array([zeros, zeros, ones]) ]) # Rotate by angle c2 in the x-z plane (around the y-axis). sin_c2 = c_z / np.sqrt(1e-20 + c_x**2 + c_y**2 + c_z**2) cos_c2 = np.sqrt(c_x**2 + c_y**2) / np.sqrt(1e-20 + c_x**2 + c_y**2 + c_z** 2) c2_rot_matrix = np.stack([ np.array([cos_c2, zeros, sin_c2]), np.array([zeros, ones, zeros]), np.array([-sin_c2, zeros, cos_c2]) ]) c_rot_matrix = _multiply_np(c2_rot_matrix, c1_rot_matrix) n_xyz = np.stack(apply_rot_to_vec_np(c_rot_matrix, n_xyz, unstack=True)).T # Place N in the x-y plane. _, n_y, n_z = [n_xyz[:, i] for i in range(3)] # Rotate by angle alpha in the y-z plane (around the x-axis). 
sin_n = -n_z / np.sqrt(1e-20 + n_y**2 + n_z**2) cos_n = n_y / np.sqrt(1e-20 + n_y**2 + n_z**2) n_rot_matrix = np.stack([ np.array([ones, zeros, zeros]), np.array([zeros, cos_n, -sin_n]), np.array([zeros, sin_n, cos_n]) ]) return (translation, np.transpose( _multiply_np(n_rot_matrix, c_rot_matrix), [2, 0, 1])) def make_transform_from_reference_np( n_xyz: np.ndarray, ca_xyz: np.ndarray, c_xyz: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Returns rotation and translation matrices to convert from reference. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, 3] of carbon xyz coordinates. Returns: A tuple (rotation, translation) where: rotation is an array of shape [batch, 3, 3] defining the rotation. translation is an array of shape [batch, 3] defining the translation. After applying the translation and rotation to the reference backbone, the coordinates will approximately equal to the input coordinates. The order of translation and rotation differs from make_canonical_transform because the rotation from this function should be applied before the translation, unlike make_canonical_transform. """ translation, rotation = make_canonical_transform_np(n_xyz, ca_xyz, c_xyz) return np.transpose(rotation, (0, 2, 1)), -translation ================================================ FILE: ppfleetx/models/protein_folding/r3.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Transformations for 3D coordinates. This Module contains objects for representing Vectors (Vecs), Rotation Matrices (Rots) and proper Rigid transformation (Rigids). These are represented as named tuples with arrays for each entry, for example a set of [N, M] points would be represented as a Vecs object with arrays of shape [N, M] for x, y and z. This is being done to improve readability by making it very clear what objects are geometric objects rather than relying on comments and array shapes. Another reason for this is to avoid using matrix multiplication primitives like matmul or einsum, on modern accelerator hardware these can end up on specialized cores such as tensor cores on GPU or the MXU on cloud TPUs, this often involves lower computational precision which can be problematic for coordinate geometry. Also these cores are typically optimized for larger matrices than 3 dimensional, this code is written to avoid any unintended use of these cores on both GPUs and TPUs. """ import paddle import numpy as np import collections from typing import List from . 
import (quat_affine, ) # Array of rigid 3D transformations, stored as array of rotations and # array of translations. Rigids = collections.namedtuple('Rigids', ['rot', 'trans']) class Vecs: def __init__(self, *args): if len(args) == 1: if type(args[0]) in [list, tuple] and len(args[0]) == 3: self.translation = paddle.stack(args[0], axis=-1) elif len(args[0]) == 1: self.translation = args[0] elif args[0].shape[-1] == 3: self.translation = args[0] else: raise ValueError('Invalid number of inputs') elif len(args) == 3: self.translation = paddle.stack(args, axis=-1) else: raise ValueError('Invalid number of inputs') def map(self, map_fn, *args): result = [] for i in range(3): r = map_fn(self.translation[..., i], *args) result.append(r) if result[0].shape[-1] == 1: return Vecs(paddle.concat(result, axis=-1)) else: return Vecs(paddle.stack(result, axis=-1)) @property def shape(self): return self.translation.shape @property def x(self): return self.translation[..., 0] @property def y(self): return self.translation[..., 1] @property def z(self): return self.translation[..., 2] def __getitem__(self, index): return Vecs(self.translation[index]) def __str__(self): return str(self.translation.shape) def __repr__(self): return str(self.translation.shape) def reshape(self, *argv): return self.translation.reshape(*argv) class Rots: def __init__(self, *args): if len(args) == 1: args = args[0] if len(args) == 9: rots = paddle.stack(args, axis=-1) self.rotation = rots.reshape(rots.shape[:-1] + [3, 3]) else: if args.shape[-1] == 3 and args.shape[-2] == 3: self.rotation = args elif args.shape[-1] == 9: self.rotation = args.reshape(args.shape[:-1] + [3, 3]) else: raise ValueError('Invalid shape of input') elif len(args) == 9: rots = paddle.stack(args, axis=-1) self.rotation = rots.reshape(rots.shape[:-1] + [3, 3]) else: raise ValueError('Invalid number of inputs') def map(self, map_fn, *args): result_i = [] for i in range(3): result_j = [] for j in range(3): r = map_fn(self.rotation[..., i, j], *args) result_j.append(r) if result_j[0].shape[-1] == 1: result_i.append(paddle.concat(result_j, axis=-1)) else: result_i.append(paddle.stack(result_j, axis=-1)) return Rots(paddle.stack(result_i, axis=-2)) @property def shape(self): return self.rotation.shape @property def xx(self): return self.rotation[..., 0, 0] @property def xy(self): return self.rotation[..., 0, 1] @property def xz(self): return self.rotation[..., 0, 2] @property def yx(self): return self.rotation[..., 1, 0] @property def yy(self): return self.rotation[..., 1, 1] @property def yz(self): return self.rotation[..., 1, 2] @property def zx(self): return self.rotation[..., 2, 0] @property def zy(self): return self.rotation[..., 2, 1] @property def zz(self): return self.rotation[..., 2, 2] def __getitem__(self, index): return Rots(self.rotation[index]) def __str__(self): return str(self.rotation.shape) def __repr__(self): return str(self.rotation.shape) def reshape(self, *argv): return self.rotation.reshape(*argv) def squared_difference(x, y): return paddle.square(x - y) def invert_rigids(r: Rigids) -> Rigids: """Computes group inverse of rigid transformations 'r'.""" inv_rots = invert_rots(r.rot) t = rots_mul_vecs(inv_rots, r.trans) inv_trans = Vecs(-1 * t.translation) return Rigids(inv_rots, inv_trans) def invert_rots(m: Rots) -> Rots: """Computes inverse of rotations 'm'.""" return Rots(m.xx, m.yx, m.zx, m.xy, m.yy, m.zy, m.xz, m.yz, m.zz) def rigids_from_3_points_vecs( point_on_neg_x_axis: Vecs, origin: Vecs, point_on_xy_plane: Vecs, ) -> Rigids: 
"""Create Rigids from 3 points. Jumper et al. (2021) Suppl. Alg. 21 "rigidFrom3Points" This creates a set of rigid transformations from 3 points by Gram Schmidt orthogonalization. Args: point_on_neg_x_axis: Vecs corresponding to points on the negative x axis origin: Origin of resulting rigid transformations point_on_xy_plane: Vecs corresponding to points in the xy plane Returns: Rigid transformations from global frame to local frames derived from the input points. """ m = rots_from_two_vecs( e0_unnormalized=vecs_sub(origin, point_on_neg_x_axis), e1_unnormalized=vecs_sub(point_on_xy_plane, origin)) return Rigids(rot=m, trans=origin) def rigids_from_3_points(point_on_neg_x_axis: paddle.Tensor, origin: paddle.Tensor, point_on_xy_plane: paddle.Tensor, eps: float=1e-8) -> Rigids: """Create Rigids from 3 points. Jumper et al. (2021) Suppl. Alg. 21 "rigidFrom3Points" This creates a set of rigid transformations from 3 points by Gram Schmidt orthogonalization. Argss: point_on_neg_x_axis: [*, 3] coordinates origin: [*, 3] coordinates point_on_xy_plane: [*, 3] coordinates eps: small regularizer added to squared norm before taking square root. Returns: Rigids corresponding to transformations from global frame to local frames derived from the input points. """ point_on_neg_x_axis = paddle.unbind(point_on_neg_x_axis, axis=-1) origin = paddle.unbind(origin, axis=-1) point_on_xy_plane = paddle.unbind(point_on_xy_plane, axis=-1) e0 = [c1 - c2 for c1, c2 in zip(origin, point_on_neg_x_axis)] e1 = [c1 - c2 for c1, c2 in zip(point_on_xy_plane, origin)] norms = paddle.sqrt( paddle.square(e0[0]) + paddle.square(e0[1]) + paddle.square(e0[2]) + eps) e0 = [c / norms for c in e0] dot = sum((c1 * c2 for c1, c2 in zip(e0, e1))) e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)] norms = paddle.sqrt( paddle.square(e1[0]) + paddle.square(e1[1]) + paddle.square(e1[2]) + eps) e1 = [c / norms for c in e1] e2 = [ e0[1] * e1[2] - e0[2] * e1[1], e0[2] * e1[0] - e0[0] * e1[2], e0[0] * e1[1] - e0[1] * e1[0], ] rots = paddle.stack([c for tup in zip(e0, e1, e2) for c in tup], axis=-1) return Rigids(Rots(rots), Vecs(origin)) def rigids_from_list(l: List[paddle.Tensor]) -> Rigids: """Converts flat list of arrays to rigid transformations.""" assert len(l) == 12 return Rigids(Rots(*(l[:9])), Vecs(*(l[9:]))) def rigids_from_quataffine(a: quat_affine.QuatAffine) -> Rigids: """Converts QuatAffine object to the corresponding Rigids object.""" return Rigids(Rots(a.rotation), Vecs(a.translation)) def rigids_from_tensor4x4(m: paddle.Tensor) -> Rigids: """Construct Rigids from an 4x4 array. Here the 4x4 is representing the transformation in homogeneous coordinates. Argss: m: [*, 4, 4] homogenous transformation tensor Returns: Rigids corresponding to transformations m """ assert m.shape[-1] == 4 assert m.shape[-2] == 4 sliced_m = m[..., 0:3, :] # shape is [..., 3, 4] outs = paddle.split(sliced_m, num_or_sections=[3, 1], axis=-1) return Rigids(Rots(outs[0]), Vecs(outs[1].squeeze_(axis=-1))) def rigids_from_tensor_flat9(m: paddle.Tensor) -> Rigids: """Flat9 encoding: first two columns of rotation matrix + translation.""" assert m.shape[-1] == 9 e0 = Vecs(m[..., 0], m[..., 1], m[..., 2]) e1 = Vecs(m[..., 3], m[..., 4], m[..., 5]) trans = Vecs(m[..., 6], m[..., 7], m[..., 8]) return Rigids(rot=rots_from_two_vecs(e0, e1), trans=trans) def rigids_from_tensor_flat12(m: paddle.Tensor # shape (..., 12) ) -> Rigids: # shape (...) 
"""Flat12 encoding: rotation matrix (9 floats) + translation (3 floats).""" assert m.shape[-1] == 12 return Rigids(Rots(m[..., :9]), Vecs(m[..., 9:])) def rigids_mul_rigids(a: Rigids, b: Rigids) -> Rigids: """Group composition of Rigids 'a' and 'b'.""" return Rigids( rots_mul_rots(a.rot, b.rot), vecs_add(a.trans, rots_mul_vecs(a.rot, b.trans))) def rigids_mul_rots(r: Rigids, m: Rots) -> Rigids: """Compose rigid transformations 'r' with rotations 'm'.""" return Rigids(rots_mul_rots(r.rot, m), r.trans) def rigids_mul_vecs(r: Rigids, v: Vecs) -> Vecs: """Apply rigid transforms 'r' to points 'v'.""" return vecs_add(rots_mul_vecs(r.rot, v), r.trans) def rigids_to_list(r: Rigids) -> List[paddle.Tensor]: """Turn Rigids into flat list, inverse of 'rigids_from_list'.""" return list(r.rot) + list(r.trans) def rigids_to_quataffine(r: Rigids) -> quat_affine.QuatAffine: """Convert Rigids r into QuatAffine, inverse of 'rigids_from_quataffine'.""" return quat_affine.QuatAffine( quaternion=None, rotation=r.rot.rotation, translation=r.trans.translation) def rigids_to_tensor_flat9(r: Rigids) -> paddle.Tensor: # shape (..., 9) """Flat9 encoding: first two columns of rotation matrix + translation.""" return paddle.stack( [r.rot.xx, r.rot.yx, r.rot.zx, r.rot.xy, r.rot.yy, r.rot.zy] + list(r.trans), axis=-1) def rigids_to_tensor_flat12(r: Rigids # shape (...) ) -> paddle.Tensor: # shape (..., 12) """Flat12 encoding: rotation matrix (9 floats) + translation (3 floats).""" return paddle.stack( [ r.rot.xx, r.rot.yx, r.rot.zx, r.rot.xy, r.rot.yy, r.rot.zy, r.rot.xz, r.rot.yz, r.rot.zz ] + [r.trans.x, r.trans.y, r.trans.z], axis=-1) def rots_from_tensor3x3( m: paddle.Tensor, # shape (..., 3, 3) ) -> Rots: # shape (...) """Convert rotations represented as (3, 3) array to Rots.""" assert m.shape[-1] == 3 assert m.shape[-2] == 3 return Rots(m) def rots_from_two_vecs(e0_unnormalized: Vecs, e1_unnormalized: Vecs) -> Rots: """Create rotation matrices from unnormalized vectors for the x and y-axes. This creates a rotation matrix from two vectors using Gram-Schmidt orthogonalization. Args: e0_unnormalized: vectors lying along x-axis of resulting rotation e1_unnormalized: vectors lying in xy-plane of resulting rotation Returns: Rotations resulting from Gram-Schmidt procedure. """ # Normalize the unit vector for the x-axis, e0. e0 = vecs_robust_normalize(e0_unnormalized) # make e1 perpendicular to e0. c = vecs_dot_vecs(e1_unnormalized, e0) e1 = Vecs(e1_unnormalized.translation - c.unsqueeze_(axis=-1) * e0.translation) e1 = vecs_robust_normalize(e1) # Compute e2 as cross product of e0 and e1. 
e2 = vecs_cross_vecs(e0, e1) return Rots( paddle.stack( [e0.translation, e1.translation, e2.translation], axis=-1)) def broadcast_shape(x_shape, y_shape): if x_shape == y_shape or len(x_shape) > len(y_shape): out_shape = x_shape elif len(y_shape) > len(x_shape): out_shape = y_shape else: out_shape = [] for i in range(len(x_shape)): if x_shape[i] == y_shape[i] or y_shape[i] == 1: out_shape.append(x_shape[i]) elif x_shape[i] == 1: out_shape.append(y_shape[i]) else: raise ValueError("{} and {} cannot braodcast.".format(x_shape, y_shape)) return out_shape def broadcast_to(x, broadcast_shape): if x.shape == broadcast_shape: return x else: return paddle.broadcast_to(x, broadcast_shape) def rots_mul_rots(a: Rots, b: Rots) -> Rots: """Composition of rotations 'a' and 'b'.""" out_shape = broadcast_shape(a.shape, b.shape) broadcasted_a = broadcast_to(a.rotation, out_shape) broadcasted_b = broadcast_to(b.rotation, out_shape) return Rots(paddle.matmul(broadcasted_a, broadcasted_b)) def rots_mul_vecs(m: Rots, v: Vecs) -> Vecs: """Apply rotations 'm' to vectors 'v'.""" if m.shape[:-2] == v.shape[:-1]: broadcasted_m = m.rotation broadcasted_v = v.translation else: out_shape = broadcast_shape(m.shape[:-2], v.shape[:-1]) broadcasted_m = broadcast_to(m.rotation, out_shape + [3, 3]) broadcasted_v = broadcast_to(v.translation, out_shape + [3]) return Vecs( paddle.matmul( broadcasted_m, broadcasted_v.unsqueeze(axis=-1)).squeeze_(axis=-1)) def vecs_add(v1: Vecs, v2: Vecs) -> Vecs: """Add two vectors 'v1' and 'v2'.""" return Vecs(v1.translation + v2.translation) def vecs_dot_vecs(v1: Vecs, v2: Vecs) -> paddle.Tensor: """Dot product of vectors 'v1' and 'v2'.""" return v1.x * v2.x + v1.y * v2.y + v1.z * v2.z def vecs_cross_vecs(v1: Vecs, v2: Vecs) -> Vecs: """Cross product of vectors 'v1' and 'v2'.""" return Vecs(paddle.cross(v1.translation, v2.translation, axis=-1)) def vecs_from_tensor(x: paddle.Tensor # shape (..., 3) ) -> Vecs: # shape (...) """Converts from tensor of shape (3,) to Vecs.""" assert x.shape[-1] == 3 return Vecs(x) def vecs_robust_normalize(v: Vecs, epsilon: float=1e-8) -> Vecs: """Normalizes vectors 'v'. Argss: v: vectors to be normalized. epsilon: small regularizer added to squared norm before taking square root. Returns: normalized vectors """ norms = vecs_robust_norm(v, epsilon) return Vecs(v.translation / norms.unsqueeze_(axis=-1)) def vecs_robust_norm(v: Vecs, epsilon: float=1e-8) -> paddle.Tensor: """Computes norm of vectors 'v'. Args: v: vectors to be normalized. epsilon: small regularizer added to squared norm before taking square root. Returns: norm of 'v' """ return paddle.sqrt( paddle.square(v.x) + paddle.square(v.y) + paddle.square(v.z) + epsilon) def vecs_sub(v1: Vecs, v2: Vecs) -> Vecs: """Computes v1 - v2.""" return Vecs(v1.translation - v2.translation) def vecs_squared_distance(v1: Vecs, v2: Vecs) -> paddle.Tensor: """Computes squared euclidean difference between 'v1' and 'v2'.""" return (squared_difference(v1.x, v2.x) + squared_difference(v1.y, v2.y) + squared_difference(v1.z, v2.z)) def vecs_to_tensor(v: Vecs # shape (...) ) -> paddle.Tensor: # shape(..., 3) """Converts 'v' to tensor with shape 3, inverse of 'vecs_from_tensor'.""" return v.translation ================================================ FILE: ppfleetx/models/protein_folding/residue_constants.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Constants used in AlphaFold.""" import collections import functools import os from typing import List, Mapping, Tuple import numpy as np import tree # Internal import (35fd). # Distance from one CA to next CA [trans configuration: omega = 180]. ca_ca = 3.80209737096 # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have # chi angles so their chi angle lists are empty. chi_angles_atoms = { 'ALA': [], # Chi5 in arginine is always 0 +- 5 degrees, so ignore it. 'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']], 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']], 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']], 'CYS': [['N', 'CA', 'CB', 'SG']], 'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'OE1']], 'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'OE1']], 'GLY': [], 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']], 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']], 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']], 'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'], ['CB', 'CG', 'SD', 'CE']], 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']], 'SER': [['N', 'CA', 'CB', 'OG']], 'THR': [['N', 'CA', 'CB', 'OG1']], 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'VAL': [['N', 'CA', 'CB', 'CG1']], } # If chi angles given in fixed-length array, this matrix determines how to mask # them for each AA type. The order is as per restype_order (see below). chi_angles_mask = [ [0.0, 0.0, 0.0, 0.0], # ALA [1.0, 1.0, 1.0, 1.0], # ARG [1.0, 1.0, 0.0, 0.0], # ASN [1.0, 1.0, 0.0, 0.0], # ASP [1.0, 0.0, 0.0, 0.0], # CYS [1.0, 1.0, 1.0, 0.0], # GLN [1.0, 1.0, 1.0, 0.0], # GLU [0.0, 0.0, 0.0, 0.0], # GLY [1.0, 1.0, 0.0, 0.0], # HIS [1.0, 1.0, 0.0, 0.0], # ILE [1.0, 1.0, 0.0, 0.0], # LEU [1.0, 1.0, 1.0, 1.0], # LYS [1.0, 1.0, 1.0, 0.0], # MET [1.0, 1.0, 0.0, 0.0], # PHE [1.0, 1.0, 0.0, 0.0], # PRO [1.0, 0.0, 0.0, 0.0], # SER [1.0, 0.0, 0.0, 0.0], # THR [1.0, 1.0, 0.0, 0.0], # TRP [1.0, 1.0, 0.0, 0.0], # TYR [1.0, 0.0, 0.0, 0.0], # VAL ] # The following chi angles are pi periodic: they can be rotated by a multiple # of pi without affecting the structure. 
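# For example, the ASP row below marks chi2 as pi-periodic because rotating it
# by pi merely swaps the interchangeable OD1/OD2 carboxylate oxygens, whereas
# the ARG row stays all-zero even though arginine has four chi angles
# (see chi_angles_mask above).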
chi_pi_periodic = [ [0.0, 0.0, 0.0, 0.0], # ALA [0.0, 0.0, 0.0, 0.0], # ARG [0.0, 0.0, 0.0, 0.0], # ASN [0.0, 1.0, 0.0, 0.0], # ASP [0.0, 0.0, 0.0, 0.0], # CYS [0.0, 0.0, 0.0, 0.0], # GLN [0.0, 0.0, 1.0, 0.0], # GLU [0.0, 0.0, 0.0, 0.0], # GLY [0.0, 0.0, 0.0, 0.0], # HIS [0.0, 0.0, 0.0, 0.0], # ILE [0.0, 0.0, 0.0, 0.0], # LEU [0.0, 0.0, 0.0, 0.0], # LYS [0.0, 0.0, 0.0, 0.0], # MET [0.0, 1.0, 0.0, 0.0], # PHE [0.0, 0.0, 0.0, 0.0], # PRO [0.0, 0.0, 0.0, 0.0], # SER [0.0, 0.0, 0.0, 0.0], # THR [0.0, 0.0, 0.0, 0.0], # TRP [0.0, 1.0, 0.0, 0.0], # TYR [0.0, 0.0, 0.0, 0.0], # VAL [0.0, 0.0, 0.0, 0.0], # UNK ] # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi, # psi and chi angles: # 0: 'backbone group', # 1: 'pre-omega-group', (empty) # 2: 'phi-group', (currently empty, because it defines only hydrogens) # 3: 'psi-group', # 4,5,6,7: 'chi1,2,3,4-group' # The atom positions are relative to the axis-end-atom of the corresponding # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis # is defined such that the dihedral-angle-definiting atom (the last entry in # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate). # format: [atomname, group_idx, rel_position] rigid_group_atom_positions = { 'ALA': [ ['N', 0, (-0.525, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, -0.000, -0.000)], ['CB', 0, (-0.529, -0.774, -1.205)], ['O', 3, (0.627, 1.062, 0.000)], ], 'ARG': [ ['N', 0, (-0.524, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, -0.000)], ['CB', 0, (-0.524, -0.778, -1.209)], ['O', 3, (0.626, 1.062, 0.000)], ['CG', 4, (0.616, 1.390, -0.000)], ['CD', 5, (0.564, 1.414, 0.000)], ['NE', 6, (0.539, 1.357, -0.000)], ['NH1', 7, (0.206, 2.301, 0.000)], ['NH2', 7, (2.078, 0.978, -0.000)], ['CZ', 7, (0.758, 1.093, -0.000)], ], 'ASN': [ ['N', 0, (-0.536, 1.357, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, -0.000, -0.000)], ['CB', 0, (-0.531, -0.787, -1.200)], ['O', 3, (0.625, 1.062, 0.000)], ['CG', 4, (0.584, 1.399, 0.000)], ['ND2', 5, (0.593, -1.188, 0.001)], ['OD1', 5, (0.633, 1.059, 0.000)], ], 'ASP': [ ['N', 0, (-0.525, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, 0.000, -0.000)], ['CB', 0, (-0.526, -0.778, -1.208)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.593, 1.398, -0.000)], ['OD1', 5, (0.610, 1.091, 0.000)], ['OD2', 5, (0.592, -1.101, -0.003)], ], 'CYS': [ ['N', 0, (-0.522, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.524, 0.000, 0.000)], ['CB', 0, (-0.519, -0.773, -1.212)], ['O', 3, (0.625, 1.062, -0.000)], ['SG', 4, (0.728, 1.653, 0.000)], ], 'GLN': [ ['N', 0, (-0.526, 1.361, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, 0.000, 0.000)], ['CB', 0, (-0.525, -0.779, -1.207)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.615, 1.393, 0.000)], ['CD', 5, (0.587, 1.399, -0.000)], ['NE2', 6, (0.593, -1.189, -0.001)], ['OE1', 6, (0.634, 1.060, 0.000)], ], 'GLU': [ ['N', 0, (-0.528, 1.361, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, -0.000, -0.000)], ['CB', 0, (-0.526, -0.781, -1.207)], ['O', 3, (0.626, 1.062, 0.000)], ['CG', 4, (0.615, 1.392, 0.000)], ['CD', 5, (0.600, 1.397, 0.000)], ['OE1', 6, (0.607, 1.095, -0.000)], ['OE2', 6, (0.589, -1.104, -0.001)], ], 'GLY': [ ['N', 0, (-0.572, 1.337, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.517, -0.000, -0.000)], ['O', 3, (0.626, 1.062, -0.000)], ], 'HIS': [ ['N', 0, (-0.527, 1.360, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, 
0.000, 0.000)], ['CB', 0, (-0.525, -0.778, -1.208)], ['O', 3, (0.625, 1.063, 0.000)], ['CG', 4, (0.600, 1.370, -0.000)], ['CD2', 5, (0.889, -1.021, 0.003)], ['ND1', 5, (0.744, 1.160, -0.000)], ['CE1', 5, (2.030, 0.851, 0.002)], ['NE2', 5, (2.145, -0.466, 0.004)], ], 'ILE': [ ['N', 0, (-0.493, 1.373, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, -0.000, -0.000)], ['CB', 0, (-0.536, -0.793, -1.213)], ['O', 3, (0.627, 1.062, -0.000)], ['CG1', 4, (0.534, 1.437, -0.000)], ['CG2', 4, (0.540, -0.785, -1.199)], ['CD1', 5, (0.619, 1.391, 0.000)], ], 'LEU': [ ['N', 0, (-0.520, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, -0.000)], ['CB', 0, (-0.522, -0.773, -1.214)], ['O', 3, (0.625, 1.063, -0.000)], ['CG', 4, (0.678, 1.371, 0.000)], ['CD1', 5, (0.530, 1.430, -0.000)], ['CD2', 5, (0.535, -0.774, 1.200)], ], 'LYS': [ ['N', 0, (-0.526, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, 0.000, 0.000)], ['CB', 0, (-0.524, -0.778, -1.208)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.619, 1.390, 0.000)], ['CD', 5, (0.559, 1.417, 0.000)], ['CE', 6, (0.560, 1.416, 0.000)], ['NZ', 7, (0.554, 1.387, 0.000)], ], 'MET': [ ['N', 0, (-0.521, 1.364, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, 0.000, 0.000)], ['CB', 0, (-0.523, -0.776, -1.210)], ['O', 3, (0.625, 1.062, -0.000)], ['CG', 4, (0.613, 1.391, -0.000)], ['SD', 5, (0.703, 1.695, 0.000)], ['CE', 6, (0.320, 1.786, -0.000)], ], 'PHE': [ ['N', 0, (-0.518, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.524, 0.000, -0.000)], ['CB', 0, (-0.525, -0.776, -1.212)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.607, 1.377, 0.000)], ['CD1', 5, (0.709, 1.195, -0.000)], ['CD2', 5, (0.706, -1.196, 0.000)], ['CE1', 5, (2.102, 1.198, -0.000)], ['CE2', 5, (2.098, -1.201, -0.000)], ['CZ', 5, (2.794, -0.003, -0.001)], ], 'PRO': [ ['N', 0, (-0.566, 1.351, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, -0.000, 0.000)], ['CB', 0, (-0.546, -0.611, -1.293)], ['O', 3, (0.621, 1.066, 0.000)], ['CG', 4, (0.382, 1.445, 0.0)], # ['CD', 5, (0.427, 1.440, 0.0)], ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger ], 'SER': [ ['N', 0, (-0.529, 1.360, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, -0.000)], ['CB', 0, (-0.518, -0.777, -1.211)], ['O', 3, (0.626, 1.062, -0.000)], ['OG', 4, (0.503, 1.325, 0.000)], ], 'THR': [ ['N', 0, (-0.517, 1.364, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, 0.000, -0.000)], ['CB', 0, (-0.516, -0.793, -1.215)], ['O', 3, (0.626, 1.062, 0.000)], ['CG2', 4, (0.550, -0.718, -1.228)], ['OG1', 4, (0.472, 1.353, 0.000)], ], 'TRP': [ ['N', 0, (-0.521, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, 0.000)], ['CB', 0, (-0.523, -0.776, -1.212)], ['O', 3, (0.627, 1.062, 0.000)], ['CG', 4, (0.609, 1.370, -0.000)], ['CD1', 5, (0.824, 1.091, 0.000)], ['CD2', 5, (0.854, -1.148, -0.005)], ['CE2', 5, (2.186, -0.678, -0.007)], ['CE3', 5, (0.622, -2.530, -0.007)], ['NE1', 5, (2.140, 0.690, -0.004)], ['CH2', 5, (3.028, -2.890, -0.013)], ['CZ2', 5, (3.283, -1.543, -0.011)], ['CZ3', 5, (1.715, -3.389, -0.011)], ], 'TYR': [ ['N', 0, (-0.522, 1.362, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.524, -0.000, -0.000)], ['CB', 0, (-0.522, -0.776, -1.213)], ['O', 3, (0.627, 1.062, -0.000)], ['CG', 4, (0.607, 1.382, -0.000)], ['CD1', 5, (0.716, 1.195, -0.000)], ['CD2', 5, (0.713, -1.194, -0.001)], ['CE1', 5, (2.107, 1.200, -0.002)], ['CE2', 5, (2.104, -1.201, -0.003)], ['OH', 5, 
(4.168, -0.002, -0.005)], ['CZ', 5, (2.791, -0.001, -0.003)], ], 'VAL': [ ['N', 0, (-0.494, 1.373, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, -0.000, -0.000)], ['CB', 0, (-0.533, -0.795, -1.213)], ['O', 3, (0.627, 1.062, -0.000)], ['CG1', 4, (0.540, 1.429, -0.000)], ['CG2', 4, (0.533, -0.776, 1.203)], ], } # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention. residue_atoms = { 'ALA': ['C', 'CA', 'CB', 'N', 'O'], 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'], 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'], 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'], 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'], 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'], 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'], 'GLY': ['C', 'CA', 'N', 'O'], 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'], 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'], 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'], 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'], 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'], 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'], 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'], 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'], 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'], 'TRP': [ 'C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2', 'N', 'NE1', 'O' ], 'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O', 'OH'], 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O'] } # Naming swaps for ambiguous atom names. # Due to symmetries in the amino acids the naming of atoms is ambiguous in # 4 of the 20 amino acids. # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities # in LEU, VAL and ARG can be resolved by using the 3d constellations of # the 'ambiguous' atoms and their neighbours) residue_atom_renaming_swaps = { 'ASP': { 'OD1': 'OD2' }, 'GLU': { 'OE1': 'OE2' }, 'PHE': { 'CD1': 'CD2', 'CE1': 'CE2' }, 'TYR': { 'CD1': 'CD2', 'CE1': 'CE2' }, } # Van der Waals radii [Angstroem] of the atoms (from Wikipedia) van_der_waals_radius = { 'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8, } Bond = collections.namedtuple( 'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev']) BondAngle = collections.namedtuple( 'BondAngle', ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev']) @functools.lru_cache(maxsize=None) def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]], Mapping[ str, List[Bond]], Mapping[str, List[BondAngle]]]: """Load stereo_chemical_props.txt into a nice structure. Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite edge of the triangle ("residue_virtual_bonds"). Returns: residue_bonds: Dict that maps resname -> list of Bond tuples. residue_virtual_bonds: Dict that maps resname -> list of Bond tuples. residue_bond_angles: Dict that maps resname -> list of BondAngle tuples. """ stereo_chemical_props_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'stereo_chemical_props.txt') with open(stereo_chemical_props_path, 'rt') as f: stereo_chemical_props = f.read() lines_iter = iter(stereo_chemical_props.splitlines()) # Load bond lengths. residue_bonds = {} next(lines_iter) # Skip header line. 
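# Each row of the bond-length section has the form
# "<atom1>-<atom2> <resname> <length> <stddev>" (whitespace separated); a line
# consisting of a single '-' character terminates the section.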
for line in lines_iter: if line.strip() == '-': break bond, resname, length, stddev = line.split() atom1, atom2 = bond.split('-') if resname not in residue_bonds: residue_bonds[resname] = [] residue_bonds[resname].append( Bond(atom1, atom2, float(length), float(stddev))) residue_bonds['UNK'] = [] # Load bond angles. residue_bond_angles = {} next(lines_iter) # Skip empty line. next(lines_iter) # Skip header line. for line in lines_iter: if line.strip() == '-': break bond, resname, angle_degree, stddev_degree = line.split() atom1, atom2, atom3 = bond.split('-') if resname not in residue_bond_angles: residue_bond_angles[resname] = [] residue_bond_angles[resname].append( BondAngle(atom1, atom2, atom3, float(angle_degree) / 180. * np.pi, float(stddev_degree) / 180. * np.pi)) residue_bond_angles['UNK'] = [] def make_bond_key(atom1_name, atom2_name): """Unique key to lookup bonds.""" return '-'.join(sorted([atom1_name, atom2_name])) # Translate bond angles into distances ("virtual bonds"). residue_virtual_bonds = {} for resname, bond_angles in residue_bond_angles.items(): # Create a fast lookup dict for bond lengths. bond_cache = {} for b in residue_bonds[resname]: bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b residue_virtual_bonds[resname] = [] for ba in bond_angles: bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)] bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)] # Compute distance between atom1 and atom3 using the law of cosines # c^2 = a^2 + b^2 - 2ab*cos(gamma). gamma = ba.angle_rad length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 * bond1.length * bond2.length * np.cos(gamma)) # Propagation of uncertainty assuming uncorrelated errors. dl_outer = 0.5 / length dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer dl_db1 = ( 2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer dl_db2 = ( 2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer stddev = np.sqrt((dl_dgamma * ba.stddev)**2 + ( dl_db1 * bond1.stddev)**2 + (dl_db2 * bond2.stddev)**2) residue_virtual_bonds[resname].append( Bond(ba.atom1_name, ba.atom3name, length, stddev)) return (residue_bonds, residue_virtual_bonds, residue_bond_angles) # Between-residue bond lengths for general bonds (first element) and for Proline # (second element). between_res_bond_length_c_n = [1.329, 1.341] between_res_bond_length_stddev_c_n = [0.014, 0.016] # Between-residue cos_angles. between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315 between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995 # This mapping is used when we need to store atom data in a format that requires # fixed atom data size for every residue (e.g. a numpy array). atom_types = [ 'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD', 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3', 'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2', 'CZ3', 'NZ', 'OXT' ] atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)} atom_type_num = len(atom_types) # := 37. 
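# Illustrative sketch (not part of the original module and not used elsewhere):
# the law-of-cosines step from load_stereo_chemical_props in isolation.  With
# approximate literature values a = 1.459 (N-CA), b = 1.525 (CA-C) and an
# N-CA-C angle of roughly 111 degrees, it gives a "virtual" N...C bond of about
# 2.46 Angstroem, which is later treated like a real bond when the per-residue
# distance bounds are assembled in make_atom14_dists_bounds.
def _example_virtual_bond_length(length_a: float,
                                 length_b: float,
                                 gamma_rad: float) -> float:
    """Length of the triangle edge opposite to the angle gamma_rad."""
    return np.sqrt(length_a**2 + length_b**2 -
                   2. * length_a * length_b * np.cos(gamma_rad))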
# A compact atom encoding with 14 columns # pylint: disable=line-too-long # pylint: disable=bad-whitespace restype_name_to_atom14_names = { 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''], 'ARG': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '', '', '' ], 'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''], 'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''], 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''], 'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''], 'GLU': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', '' ], 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''], 'HIS': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '', '', '' ], 'ILE': [ 'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', '' ], 'LEU': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', '' ], 'LYS': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', '' ], 'MET': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', '' ], 'PHE': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '', '', '' ], 'PRO': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', '' ], 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''], 'THR': [ 'N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', '' ], 'TRP': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2' ], 'TYR': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', '' ], 'VAL': [ 'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', '' ], 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''], } # pylint: enable=line-too-long # pylint: enable=bad-whitespace # This is the standard residue order when coding AA type as a number. # Reproduce it by taking 3-letter AA codes and sorting them alphabetically. restypes = [ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' ] restype_order = {restype: i for i, restype in enumerate(restypes)} restype_num = len(restypes) # := 20. unk_restype_index = restype_num # Catch-all index for unknown restypes. restypes_with_x = restypes + ['X'] restype_order_with_x = { restype: i for i, restype in enumerate(restypes_with_x) } def sequence_to_onehot(sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool=False) -> np.ndarray: """Maps the given sequence into a one-hot encoded matrix. Args: sequence: An amino acid sequence. mapping: A dictionary mapping amino acids to integers. map_unknown_to_x: If True, any amino acid that is not in the mapping will be mapped to the unknown amino acid 'X'. If the mapping doesn't contain amino acid 'X', an error will be thrown. If False, any amino acid not in the mapping will throw an error. Returns: A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of the sequence. Raises: ValueError: If the mapping doesn't contain values from 0 to num_unique_aas - 1 without any gaps. """ num_entries = max(mapping.values()) + 1 if sorted(set(mapping.values())) != list(range(num_entries)): raise ValueError( 'The mapping must have values from 0 to num_unique_aas-1 ' 'without any gaps. 
Got: %s' % sorted(mapping.values())) one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32) for aa_index, aa_type in enumerate(sequence): if map_unknown_to_x: if aa_type.isalpha() and aa_type.isupper(): aa_id = mapping.get(aa_type, mapping['X']) else: raise ValueError( f'Invalid character in the sequence: {aa_type}') else: aa_id = mapping[aa_type] one_hot_arr[aa_index, aa_id] = 1 return one_hot_arr restype_1to3 = { 'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'C': 'CYS', 'Q': 'GLN', 'E': 'GLU', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'F': 'PHE', 'P': 'PRO', 'S': 'SER', 'T': 'THR', 'W': 'TRP', 'Y': 'TYR', 'V': 'VAL', } # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple # 1-to-1 mapping of 3 letter names to one letter names. The latter contains # many more, and less common, three letter names as keys and maps many of these # to the same one letter name (including 'X' and 'U' which we don't use here). restype_3to1 = {v: k for k, v in restype_1to3.items()} # Define a restype name for all unknown residues. unk_restype = 'UNK' resnames = [restype_1to3[r] for r in restypes] + [unk_restype] resname_to_idx = {resname: i for i, resname in enumerate(resnames)} # The mapping here uses hhblits convention, so that B is mapped to D, J and O # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the # remaining 20 amino acids are kept in alphabetical order. # There are 2 non-amino acid codes, X (representing any amino acid) and # "-" representing a missing amino acid in an alignment. The id for these # codes is put at the end (20 and 21) so that they can easily be ignored if # desired. HHBLITS_AA_TO_ID = { 'A': 0, 'B': 2, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'J': 20, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'O': 20, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'U': 1, 'V': 17, 'W': 18, 'X': 20, 'Y': 19, 'Z': 3, '-': 21, } # Partial inversion of HHBLITS_AA_TO_ID. ID_TO_HHBLITS_AA = { 0: 'A', 1: 'C', # Also U. 2: 'D', # Also B. 3: 'E', # Also Z. 4: 'F', 5: 'G', 6: 'H', 7: 'I', 8: 'K', 9: 'L', 10: 'M', 11: 'N', 12: 'P', 13: 'Q', 14: 'R', 15: 'S', 16: 'T', 17: 'V', 18: 'W', 19: 'Y', 20: 'X', # Includes J and O. 21: '-', } restypes_with_x_and_gap = restypes + ['X', '-'] MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple( restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) for i in range(len(restypes_with_x_and_gap))) def _make_standard_atom_mask() -> np.ndarray: """Returns [num_res_types, num_atom_types] mask array.""" # +1 to account for unknown (all 0s). mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32) for restype, restype_letter in enumerate(restypes): restype_name = restype_1to3[restype_letter] atom_names = residue_atoms[restype_name] for atom_name in atom_names: atom_type = atom_order[atom_name] mask[restype, atom_type] = 1 return mask STANDARD_ATOM_MASK = _make_standard_atom_mask() # A one hot representation for the first and second atoms defining the axis # of rotation for each chi-angle in each residue. 
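# Illustrative usage of sequence_to_onehot (not part of the original module):
# with map_unknown_to_x=True, letters outside the 20 standard amino acids fall
# back to 'X' (index 20), e.g.
#   sequence_to_onehot('ARB', restype_order_with_x, map_unknown_to_x=True)
# returns a (3, 21) int32 array with ones at [0, 0], [1, 1] and [2, 20].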
def chi_angle_atom(atom_index: int) -> np.ndarray: """Define chi-angle rigid groups via one-hot representations.""" chi_angles_index = {} one_hots = [] for k, v in chi_angles_atoms.items(): indices = [atom_types.index(s[atom_index]) for s in v] indices.extend([-1] * (4 - len(indices))) chi_angles_index[k] = indices for r in restypes: res3 = restype_1to3[r] one_hot = np.eye(atom_type_num)[chi_angles_index[res3]] one_hots.append(one_hot) one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`. one_hot = np.stack(one_hots, axis=0) one_hot = np.transpose(one_hot, [0, 2, 1]) return one_hot chi_atom_1_one_hot = chi_angle_atom(1) chi_atom_2_one_hot = chi_angle_atom(2) # An array like chi_angles_atoms but using indices rather than names. chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes] chi_angles_atom_indices = tree.map_structure( lambda atom_name: atom_order[atom_name], chi_angles_atom_indices) chi_angles_atom_indices = np.array([ chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) for chi_atoms in chi_angles_atom_indices ]) # Mapping from (res_name, atom_name) pairs to the atom's chi group index # and atom index within that group. chi_groups_for_atom = collections.defaultdict(list) for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items(): for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res): for atom_i, atom in enumerate(chi_group): chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i)) chi_groups_for_atom = dict(chi_groups_for_atom) def _make_rigid_transformation_4x4(ex, ey, translation): """Create a rigid 4x4 transformation matrix from two axes and transl.""" # Normalize ex. ex_normalized = ex / np.linalg.norm(ex) # make ey perpendicular to ex ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized ey_normalized /= np.linalg.norm(ey_normalized) # compute ez as cross product eznorm = np.cross(ex_normalized, ey_normalized) m = np.stack( [ex_normalized, ey_normalized, eznorm, translation]).transpose() m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0) return m # create an array with (restype, atomtype) --> rigid_group_idx # and an array with (restype, atomtype, coord) for the atom positions # and compute affine transformation matrices (4,4) from one rigid group to the # previous group restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=np.int) restype_atom37_mask = np.zeros([21, 37], dtype=np.float32) restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32) restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=np.int) restype_atom14_mask = np.zeros([21, 14], dtype=np.float32) restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32) restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32) def _make_rigid_group_constants(): """Fill the arrays above.""" for restype, restype_letter in enumerate(restypes): resname = restype_1to3[restype_letter] for atomname, group_idx, atom_position in rigid_group_atom_positions[ resname]: atomtype = atom_order[atomname] restype_atom37_to_rigid_group[restype, atomtype] = group_idx restype_atom37_mask[restype, atomtype] = 1 restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position atom14idx = restype_name_to_atom14_names[resname].index(atomname) restype_atom14_to_rigid_group[restype, atom14idx] = group_idx restype_atom14_mask[restype, atom14idx] = 1 restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position for restype, restype_letter in enumerate(restypes): resname = 
restype_1to3[restype_letter] atom_positions = { name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname] } # backbone to backbone is the identity transform restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4) # pre-omega-frame to backbone (currently dummy identity matrix) restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4) # phi-frame to backbone mat = _make_rigid_transformation_4x4( ex=atom_positions['N'] - atom_positions['CA'], ey=np.array([1., 0., 0.]), translation=atom_positions['N']) restype_rigid_group_default_frame[restype, 2, :, :] = mat # psi-frame to backbone mat = _make_rigid_transformation_4x4( ex=atom_positions['C'] - atom_positions['CA'], ey=atom_positions['CA'] - atom_positions['N'], translation=atom_positions['C']) restype_rigid_group_default_frame[restype, 3, :, :] = mat # chi1-frame to backbone if chi_angles_mask[restype][0]: base_atom_names = chi_angles_atoms[resname][0] base_atom_positions = [ atom_positions[name] for name in base_atom_names ] mat = _make_rigid_transformation_4x4( ex=base_atom_positions[2] - base_atom_positions[1], ey=base_atom_positions[0] - base_atom_positions[1], translation=base_atom_positions[2]) restype_rigid_group_default_frame[restype, 4, :, :] = mat # chi2-frame to chi1-frame # chi3-frame to chi2-frame # chi4-frame to chi3-frame # luckily all rotation axes for the next frame start at (0,0,0) of the # previous frame for chi_idx in range(1, 4): if chi_angles_mask[restype][chi_idx]: axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2] axis_end_atom_position = atom_positions[axis_end_atom_name] mat = _make_rigid_transformation_4x4( ex=axis_end_atom_position, ey=np.array([-1., 0., 0.]), translation=axis_end_atom_position) restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat _make_rigid_group_constants() def make_atom14_dists_bounds(overlap_tolerance=1.5, bond_length_tolerance_factor=15): """compute upper and lower bounds for bonds to assess violations.""" restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32) restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32) restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32) residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props() for restype, restype_letter in enumerate(restypes): resname = restype_1to3[restype_letter] atom_list = restype_name_to_atom14_names[resname] # create lower and upper bounds for clashes for atom1_idx, atom1_name in enumerate(atom_list): if not atom1_name: continue atom1_radius = van_der_waals_radius[atom1_name[0]] for atom2_idx, atom2_name in enumerate(atom_list): if (not atom2_name) or atom1_idx == atom2_idx: continue atom2_radius = van_der_waals_radius[atom2_name[0]] lower = atom1_radius + atom2_radius - overlap_tolerance upper = 1e10 restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper # overwrite lower and upper bounds for bonds and angles for b in residue_bonds[resname] + residue_virtual_bonds[resname]: atom1_idx = atom_list.index(b.atom1_name) atom2_idx = atom_list.index(b.atom2_name) lower = b.length - bond_length_tolerance_factor * b.stddev upper = b.length + bond_length_tolerance_factor * b.stddev restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = 
lower restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev return { 'lower_bound': restype_atom14_bond_lower_bound, # shape (21,14,14) 'upper_bound': restype_atom14_bond_upper_bound, # shape (21,14,14) 'stddev': restype_atom14_bond_stddev, # shape (21,14,14) } ================================================ FILE: ppfleetx/models/protein_folding/template.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from ppfleetx.distributed.protein_folding import dap from .attentions import ( Attention, TriangleMultiplication, TriangleAttention, ) from .common import ( Transition, Dropout, recompute_wrapper, dgram_from_positions, subbatch, ) from . import (residue_constants, ) from . import (quat_affine, ) class TemplatePair(nn.Layer): """Pair processing for the templates. Jumper et al. (2021) Suppl. Alg. 16 "TemplatePairStack" lines 2-6 """ def __init__(self, channel_num, config, global_config): super(TemplatePair, self).__init__() self.config = config self.global_config = global_config channel_num = {} channel_num[ 'pair_channel'] = self.config.triangle_attention_ending_node.value_dim self.triangle_attention_starting_node = TriangleAttention( channel_num, self.config.triangle_attention_starting_node, self.global_config, name='triangle_attention_starting_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_starting_node) self.triangle_starting_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_attention_ending_node = TriangleAttention( channel_num, self.config.triangle_attention_ending_node, self.global_config, name='triangle_attention_ending_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_ending_node) self.triangle_ending_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_multiplication_outgoing = TriangleMultiplication( channel_num, self.config.triangle_multiplication_outgoing, self.global_config, name='triangle_multiplication_outgoing') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_multiplication_outgoing) self.triangle_outgoing_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_multiplication_incoming = TriangleMultiplication( channel_num, self.config.triangle_multiplication_incoming, self.global_config, name='triangle_multiplication_incoming') dropout_rate, dropout_axis = self._parse_dropout_params( 
self.triangle_multiplication_incoming) self.triangle_incoming_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.pair_transition = Transition( channel_num, self.config.pair_transition, self.global_config, is_extra_msa=False, transition_type='pair_transition') dropout_rate, dropout_axis = self._parse_dropout_params( self.pair_transition) self.pair_transition_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) def _parse_dropout_params(self, module): dropout_rate = 0.0 if self.global_config.deterministic else \ module.config.dropout_rate dropout_axis = None if module.config.shared_dropout: dropout_axis = { 'per_row': [0, 2, 3], 'per_column': [0, 1, 3], }[module.config.orientation] return dropout_rate, dropout_axis def forward(self, pair_act, pair_mask): """Builds one block of TemplatePair module. Arguments: pair_act: Pair activations for single template, shape [batch, N_res, N_res, c_t]. pair_mask: Pair mask, shape [batch, N_res, N_res]. Returns: Updated pair_act, shape [batch, N_res, N_res, c_t]. """ pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) pair_act = pair_act + residual pair_act = dap.row_to_col(pair_act) residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) pair_act = pair_act + residual pair_act = dap.col_to_row(pair_act) residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) pair_act = pair_act + residual pair_act = dap.row_to_col(pair_act) residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) pair_act = pair_act + residual pair_act = dap.col_to_row(pair_act) return pair_act class SingleTemplateEmbedding(nn.Layer): """Embeds a single template. Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9+11 """ def __init__(self, channel_num, config, global_config): super(SingleTemplateEmbedding, self).__init__() self.config = config self.channel_num = channel_num self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear self.embedding2d = Linear(channel_num['template_pair'], self.config.template_pair_stack. triangle_attention_ending_node.value_dim) self.template_pair_stack = nn.LayerList() for _ in range(self.config.template_pair_stack.num_block): self.template_pair_stack.append( TemplatePair(self.channel_num, self.config.template_pair_stack, self.global_config)) self.output_layer_norm = nn.LayerNorm(self.config.attention.key_dim) def forward(self, query_embedding, batch, mask_2d): """Build the single template embedding. Arguments: query_embedding: Query pair representation, shape [batch, N_res, N_res, c_z]. batch: A batch of template features (note the template dimension has been stripped out as this module only runs over a single template). mask_2d: Padding mask (Note: this doesn't care if a template exists, unlike the template_pseudo_beta_mask). Returns: A template embedding [N_res, N_res, c_z]. 
""" assert mask_2d.dtype == query_embedding.dtype dtype = query_embedding.dtype num_res = batch['template_aatype'].shape[1] template_mask = batch['template_pseudo_beta_mask'] # template_mask[..., None] * template_mask[..., None, :] template_mask_2d = template_mask.unsqueeze( axis=-1) * template_mask.unsqueeze(axis=-2) template_mask_2d = template_mask_2d.astype(dtype) template_dgram = dgram_from_positions(batch['template_pseudo_beta'], **self.config.dgram_features) template_dgram = template_dgram.astype(dtype) aatype = nn.functional.one_hot(batch['template_aatype'], 22) aatype = aatype.astype(dtype) to_concat = [template_dgram, template_mask_2d.unsqueeze(axis=-1)] to_concat.append( paddle.tile( aatype.unsqueeze(axis=-3), # aatype[..., None, :, :] [1, num_res, 1, 1])) to_concat.append( paddle.tile( aatype.unsqueeze(axis=-2), # aatype[..., None, :] [1, 1, num_res, 1])) n, ca, c = [residue_constants.atom_order[a] for a in ('N', 'CA', 'C')] rot, trans = quat_affine.make_transform_from_reference( n_xyz=batch['template_all_atom_positions'][..., n, :], ca_xyz=batch['template_all_atom_positions'][..., ca, :], c_xyz=batch['template_all_atom_positions'][..., c, :]) affines = quat_affine.QuatAffine( quaternion=quat_affine.rot_to_quat(rot), translation=trans, rotation=rot) points = [ paddle.unsqueeze( x, axis=-2) for x in paddle.unstack( affines.translation, axis=-1) ] affine_vec = affines.invert_point(points, extra_dims=1) inv_distance_scalar = paddle.rsqrt(1e-6 + sum( [paddle.square(x) for x in affine_vec])) # Backbone affine mask: whether the residue has C, CA, N # (the template mask defined above only considers pseudo CB). template_mask = (batch['template_all_atom_masks'][..., n] * batch['template_all_atom_masks'][..., ca] * batch['template_all_atom_masks'][..., c]) # template_mask[..., None] * template_mask[..., None, :] template_mask_2d = template_mask.unsqueeze( axis=-1) * template_mask.unsqueeze(axis=-2) inv_distance_scalar *= template_mask_2d.astype( inv_distance_scalar.dtype) unit_vector = [(x * inv_distance_scalar).unsqueeze(axis=-1) for x in affine_vec] unit_vector = [x.astype(dtype) for x in unit_vector] if not self.config.use_template_unit_vector: unit_vector = [paddle.zeros_like(x) for x in unit_vector] to_concat.extend(unit_vector) template_mask_2d = template_mask_2d.astype(dtype) to_concat.append(template_mask_2d.unsqueeze(axis=-1)) act = paddle.concat(to_concat, axis=-1) # Mask out non-template regions so we don't get arbitrary values in the # distogram for these regions. act *= template_mask_2d.unsqueeze(axis=-1) act = self.embedding2d(act) act = dap.scatter(act, axis=1) for idx, pair_encoder in enumerate(self.template_pair_stack): act = recompute_wrapper( pair_encoder, act, mask_2d, is_recompute=self.training and idx >= self.config.template_pair_stack.recompute_start_block_index) act = dap.gather(act, axis=1) act = self.output_layer_norm(act) return act class TemplateEmbedding(nn.Layer): """Embeds a set of templates. Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-12 Jumper et al. (2021) Suppl. Alg. 
17 "TemplatePointwiseAttention" """ def __init__(self, channel_num, config, global_config): super(TemplateEmbedding, self).__init__() self.config = config self.global_config = global_config self.single_template_embedding = SingleTemplateEmbedding( channel_num, config, global_config) self.attention = Attention( config.attention, global_config, channel_num['pair_channel'], config.attention.key_dim, channel_num['pair_channel']) def forward(self, query_embedding, template_batch, mask_2d): """Build TemplateEmbedding module. Arguments: query_embedding: Query pair representation, shape [n_batch, N_res, N_res, c_z]. template_batch: A batch of template features. mask_2d: Padding mask (Note: this doesn't care if a template exists, unlike the template_pseudo_beta_mask). Returns: A template embedding [n_batch, N_res, N_res, c_z]. """ num_templates = template_batch['template_mask'].shape[1] num_channels = (self.config.template_pair_stack. triangle_attention_ending_node.value_dim) num_res = query_embedding.shape[1] dtype = query_embedding.dtype template_mask = template_batch['template_mask'] template_mask = template_mask.astype(dtype) query_channels = query_embedding.shape[-1] outs = [] for i in range(num_templates): # By default, num_templates = 4 batch0 = { k: paddle.squeeze( v.slice([1], [i], [i + 1]), axis=1) for k, v in template_batch.items() } outs.append( self.single_template_embedding(query_embedding, batch0, mask_2d)) template_pair_repr = paddle.stack(outs, axis=1) flat_query = paddle.reshape( query_embedding, [-1, num_res * num_res, 1, query_channels]) flat_templates = paddle.reshape( paddle.transpose(template_pair_repr, [0, 2, 3, 1, 4]), [-1, num_res * num_res, num_templates, num_channels]) bias = 1e9 * (template_mask[:, None, None, None, :] - 1.) if not self.training: sb_attn = subbatch(self.attention, [0, 1], [1, 1], self.config.subbatch_size, 1) emb = sb_attn(flat_query, flat_templates, bias) else: emb = self.attention(flat_query, flat_templates, bias) emb = paddle.reshape(emb, [-1, num_res, num_res, query_channels]) # No gradients if no templates. emb *= (paddle.sum(template_mask) > 0.).astype(emb.dtype) return emb ================================================ FILE: ppfleetx/models/vision_model/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/vision_model/factory.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import copy import importlib from .vit import * from .loss import * from .metrics import * from .resnet import * from .moco import * from .layers import * __all__ = ['build', ] def build(config): if config is None: return None config = copy.deepcopy(config) model_type = config.pop("name") mod = importlib.import_module(__name__) model = getattr(mod, model_type)(**config) return model ================================================ FILE: ppfleetx/models/vision_model/general_classification_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import copy import importlib from collections import defaultdict import numpy as np import paddle from paddle import LazyGuard from paddle.static import InputSpec from ppfleetx.utils.log import logger from ppfleetx.core.module.basic_module import BasicModule from .factory import build class GeneralClsModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(GeneralClsModule, self).__init__(configs) assert 'train' in self.model_configs.loss self.loss_fn = build(self.model_configs.loss.train) self.eval_loss_fn = None if 'eval' in self.model_configs.loss: self.eval_loss_fn = build(self.model_configs.loss.eval) if 'train' in self.model_configs.metric: self.train_metric_fn = build(self.model_configs.metric.train) if 'eval' in self.model_configs.metric: self.eval_metric_fn = build(self.model_configs.metric.eval) self.train_batch_size = None self.eval_batch_size = None self.best_metric = 0.0 self.acc_list = [] def get_model(self): if not hasattr(self, 'model') or self.model is None: self.model = build(self.model_configs.model) return self.model def qat_model(self): self.quanter = paddleslim.dygraph.quant.QAT(config=self.qat_config) self.quanter.quantize(self.model) def forward(self, inputs): return self.model(inputs) def training_step(self, batch): inputs, labels = batch if self.train_batch_size is None: self.train_batch_size = inputs.shape[ 0] * paddle.distributed.get_world_size() inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): ips = self.train_batch_size / log_dict['train_cost'] logger.info( "[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips)) def validation_step(self, batch): inputs, labels = batch batch_size = inputs.shape[0] inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.eval_loss_fn(logits, labels) if paddle.distributed.get_world_size() > 
1: label_list = [] paddle.distributed.all_gather(label_list, labels) labels = paddle.concat(label_list, 0) pred_list = [] paddle.distributed.all_gather(pred_list, logits) logits = paddle.concat(pred_list, 0) if self.eval_batch_size is None: self.eval_batch_size = logits.shape[0] acc = self.eval_metric_fn(logits, labels) self.acc_list.append(acc) return loss def validation_step_end(self, log_dict): ips = self.eval_batch_size / log_dict['eval_cost'] speed = self.configs['Engine']['logging_freq'] / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, step: [%d/%d], loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['loss'], log_dict['eval_cost'], ips)) def input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) def validation_epoch_end(self, log_dict): msg = '' if len(self.acc_list) > 0: ret = defaultdict(list) for item in self.acc_list: for key, val in item.items(): ret[key].append(val) for k, v in ret.items(): ret[k] = np.mean(v) if 'metric' in ret and ret['metric'] > self.best_metric: self.best_metric = ret['metric'] if 'metric' in ret: ret['best_metric'] = self.best_metric msg = ', ' msg += ", ".join([f'{k} = {v:.6f}' for k, v in ret.items()]) self.acc_list.clear() logger.info("[Eval] epoch: %d, total time: %.5f sec%s" % (log_dict['epoch'], log_dict['eval_cost'], msg)) class GeneralClsModuleAuto(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(GeneralClsModuleAuto, self).__init__(configs) assert 'loss' in self.model_configs self.loss_fn = build(self.model_configs.loss) if 'metric' in self.model_configs: self.metric_fn = build(self.model_configs.metric) def get_model(self): with LazyGuard(): if not hasattr(self, 'model') or self.model is None: self.model = build(self.model_configs.model) return self.model def input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] ================================================ FILE: ppfleetx/models/vision_model/layers/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from .mlp import * from .identity import * ================================================ FILE: ppfleetx/models/vision_model/layers/attention.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from .initializer import xavier_uniform_, zeros_ class ViTAttention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): xavier_uniform_(m.weight) zeros_(m.bias) def forward(self, x): N, C = x.shape[1:] qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = (paddle.matmul(attn, v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) x = self.proj(x) x = self.proj_drop(x) return x ================================================ FILE: ppfleetx/models/vision_model/layers/droppath.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn def drop_path(x, drop_prob=0., training=False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... """ if drop_prob == 0. or not training: return x keep_prob = paddle.to_tensor(1 - drop_prob) shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) if x.dtype == paddle.float16: random_tensor = keep_prob + paddle.rand( shape, dtype=paddle.float32).astype(x.dtype) else: random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor return output class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) ================================================ FILE: ppfleetx/models/vision_model/layers/embedding.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn class ViTPatchEmbed(nn.Layer): """ Image to Patch Embedding """ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size) patch_size = patch_size if isinstance(patch_size, tuple) else ( patch_size, patch_size) num_patches = (img_size[1] // patch_size[1]) * \ (img_size[0] // patch_size[0]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, C, H, W = x.shape assert H == self.img_size[0] and W == self.img_size[1], \ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." x = self.proj(x).flatten(2).transpose((0, 2, 1)) return x ================================================ FILE: ppfleetx/models/vision_model/layers/identity.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn __all__ = ['Identity', ] class Identity(nn.Layer): def __init__(self): super(Identity, self).__init__() def forward(self, input): return input ================================================ FILE: ppfleetx/models/vision_model/layers/initializer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import math from paddle.nn.initializer import Constant, Normal, XavierUniform, Uniform mlp_bias_normal_ = Normal(std=1e-6) pos_normal_ = Normal(std=0.02) xavier_uniform_ = XavierUniform() zeros_ = Constant(value=0.) minus_tens_ = Constant(value=-10.) ones_ = Constant(value=1.) 
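# Xavier/Glorot uniform initialization for parameters whose dimensions are
# grouped around a split axis (e.g. the fused attention qkv weight used in
# vit.py). fan_in and fan_out are taken as the products of the dims before and
# after `axis`, and values are drawn from U(-limit, limit) with
# limit = sqrt(6 / (fan_in + fan_out)).
#
# Illustrative usage sketch (shapes are hypothetical, not from this module):
#   >>> w = paddle.create_parameter(shape=[3, 12, 64, 768], dtype='float32')
#   >>> xavier_uniform_2d_(w, axis=-1)   # fan_in = 3*12*64, fan_out = 768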
def xavier_uniform_2d_(param, axis=-1): fan_in = int(np.prod(param.shape[:axis])) fan_out = int(np.prod(param.shape[axis:])) limit = math.sqrt(6.0 / (fan_in + fan_out)) uniform = Uniform(low=-limit, high=limit) uniform(param) ================================================ FILE: ppfleetx/models/vision_model/layers/mlp.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from .initializer import xavier_uniform_, mlp_bias_normal_ __all__ = ['ViTMLP', ] class ViTMLP(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): xavier_uniform_(m.weight) mlp_bias_normal_(m.bias) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x ================================================ FILE: ppfleetx/models/vision_model/loss/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .cross_entropy import * ================================================ FILE: ppfleetx/models/vision_model/loss/cross_entropy.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
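# Cross-entropy losses used by the vision models in this package:
# - CELoss: softmax cross entropy with optional label smoothing
#   (epsilon in [0, 1]); accepts integer labels or soft/one-hot labels.
# - ViTCELoss: ViT-style sigmoid (binary) cross entropy over one-hot labels,
#   with optional label smoothing applied as label * (1 - epsilon) + epsilon.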
import paddle import paddle.nn as nn import paddle.nn.functional as F __all__ = [ 'ViTCELoss', 'CELoss', ] class CELoss(nn.Layer): """ Softmax Cross entropy loss """ def __init__(self, epsilon=None): super().__init__() if epsilon is not None: assert epsilon >= 0 and epsilon <= 1, "epsilon must be in [0, 1]" self.epsilon = epsilon def _labelsmoothing(self, target, class_num): if len(target.shape) == 1 or target.shape[-1] != class_num: one_hot_target = F.one_hot(target, class_num) else: one_hot_target = target soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) return soft_target def forward(self, x, label): if isinstance(x, dict): x = x["logits"] if self.epsilon is not None: class_num = x.shape[-1] label = self._labelsmoothing(label, class_num) x = -F.log_softmax(x, axis=-1) loss = paddle.sum(x * label, axis=-1) else: if label.shape[-1] == x.shape[-1]: loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: label = paddle.cast(label, 'int64') loss = F.cross_entropy(x, label=label, soft_label=False) loss = loss.mean() return loss class ViTCELoss(nn.Layer): """ ViT style Sigmoid Cross entropy loss """ def __init__(self, epsilon=None): super().__init__() if epsilon is not None: assert epsilon >= 0 and epsilon <= 1, "epsilon must be in [0, 1]" self.epsilon = epsilon def forward(self, x, label): if isinstance(x, dict): x = x["logits"] class_num = x.shape[-1] if len(label.shape) == 1 or label.shape[-1] != class_num: label = F.one_hot(label, class_num) label = paddle.reshape(label, shape=[-1, class_num]) if self.epsilon is not None: # vit style label smoothing with paddle.no_grad(): label = label * (1.0 - self.epsilon) + self.epsilon if x.dtype == paddle.float16: x = paddle.cast(x, 'float32') loss = F.binary_cross_entropy_with_logits(x, label, reduction='none') loss = paddle.sum(loss, axis=-1) loss = loss.mean() return loss ================================================ FILE: ppfleetx/models/vision_model/metrics/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .accuracy import * ================================================ FILE: ppfleetx/models/vision_model/metrics/accuracy.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
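# Top-k accuracy metric for classification. TopkAcc returns a dict such as
# {'top1': ..., 'top5': ..., 'metric': ...}, where 'metric' mirrors the first
# entry in `topk` and is the value the training modules track as best_metric.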
import paddle import paddle.nn as nn class TopkAcc(nn.Layer): def __init__(self, topk=(1, 5)): super().__init__() assert isinstance(topk, (int, list, tuple)) if isinstance(topk, int): topk = [topk] self.topk = topk def forward(self, x, label): if isinstance(x, dict): x = x["logits"] if len(label.shape) == 1: label = label.reshape([label.shape[0], -1]) if label.dtype == paddle.int32: label = paddle.cast(label, 'int64') metric_dict = dict() for i, k in enumerate(self.topk): acc = paddle.metric.accuracy(x, label, k=k).item() metric_dict["top{}".format(k)] = acc if i == 0: metric_dict["metric"] = acc return metric_dict ================================================ FILE: ppfleetx/models/vision_model/moco/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .moco import * ================================================ FILE: ppfleetx/models/vision_model/moco/moco.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections.abc import Callable import os import copy import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Constant, Normal from ..layers.identity import Identity __all__ = [ 'MoCo', 'MoCoV2Projector', 'MoCoClassifier', ] @paddle.no_grad() def concat_all_gather(tensor): """ Performs all_gather operation on the provided tensors. """ if paddle.distributed.get_world_size() < 2: return tensor tensors_gather = [] paddle.distributed.all_gather(tensors_gather, tensor) output = paddle.concat(tensors_gather, axis=0) return output class MoCoV2Projector(nn.Layer): def __init__(self, with_pool, in_dim, out_dim): super().__init__() self.with_pool = with_pool if with_pool: self.avgpool = nn.Sequential( nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1)) self.mlp = nn.Sequential(nn.Linear(in_dim, out_dim), nn.ReLU()) def forward(self, x): if self.with_pool: x = self.avgpool(x) x = self.mlp(x) return x class MoCoClassifier(nn.Layer): def __init__(self, with_pool, num_features, num_classes): super().__init__() self.with_pool = with_pool if with_pool: self.avgpool = nn.Sequential( nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1)) self.fc = nn.Linear(num_features, num_classes) normal_ = Normal(std=0.01) zeros_ = Constant(value=0.) 
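# Initialize the linear classification head following the MoCo reference
# implementation: weights drawn from N(0, 0.01), biases set to zero.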
normal_(self.fc.weight) zeros_(self.fc.bias) def forward(self, x): if self.with_pool: x = self.avgpool(x) x = self.fc(x) return x class MoCo(nn.Layer): """ MoCo v1, v2 ref: https://github.com/facebookresearch/moco/blob/main/moco/builder.py ref: https://github.com/PaddlePaddle/PASSL/blob/main/passl/modeling/architectures/moco.py """ def __init__(self, base_encoder, base_projector, base_classifier, momentum_encoder, momentum_projector, momentum_classifier, dim=128, K=65536, m=0.999, T=0.07, **kwargs): super(MoCo, self).__init__() self.m = m self.T = T self.K = K self.base_encoder = nn.Sequential(base_encoder, base_projector, base_classifier) self.momentum_encoder = nn.Sequential( momentum_encoder, momentum_projector, momentum_classifier) for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()): param_m.copy_(param_b, False) # initialize param_m.stop_gradient = True # not update by gradient # create the queue self.register_buffer("queue", paddle.randn([dim, K])) self.queue = F.normalize(self.queue, axis=0) self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) @paddle.no_grad() def _update_momentum_encoder(self): """Momentum update of the momentum encoder""" #Note(GuoxiaWang): disable auto cast when use mix_precision with paddle.amp.auto_cast(False): for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()): paddle.assign((param_m * self.m + param_b * (1. - self.m)), param_m) param_m.stop_gradient = True @paddle.no_grad() def _dequeue_and_enqueue(self, keys): keys = concat_all_gather(keys) batch_size = keys.shape[0] ptr = int(self.queue_ptr[0]) assert self.K % batch_size == 0 # for simplicity # replace the keys at ptr (dequeue and enqueue) self.queue[:, ptr:ptr + batch_size] = keys.transpose([1, 0]) ptr = (ptr + batch_size) % self.K # move pointer self.queue_ptr[0] = ptr @paddle.no_grad() def _batch_shuffle_ddp(self, x): """ Batch shuffle, for making use of BatchNorm. *** Only support DistributedDataParallel (DDP) model. *** """ # gather from all gpus batch_size_this = x.shape[0] x_gather = concat_all_gather(x) batch_size_all = x_gather.shape[0] num_gpus = batch_size_all // batch_size_this # random shuffle index idx_shuffle = paddle.randperm(batch_size_all) # broadcast to all gpus if paddle.distributed.get_world_size() > 1: paddle.distributed.broadcast(idx_shuffle, src=0) # index for restoring idx_unshuffle = paddle.argsort(idx_shuffle) # shuffled index for this gpu gpu_idx = paddle.distributed.get_rank() idx_this = idx_shuffle.reshape([num_gpus, -1])[gpu_idx] return paddle.gather(x_gather, idx_this, axis=0), idx_unshuffle @paddle.no_grad() def _batch_unshuffle_ddp(self, x, idx_unshuffle): """ Undo batch shuffle. *** Only support DistributedDataParallel (DDP) model. 
*** """ # gather from all gpus batch_size_this = x.shape[0] x_gather = concat_all_gather(x) batch_size_all = x_gather.shape[0] num_gpus = batch_size_all // batch_size_this # restored index for this gpu gpu_idx = paddle.distributed.get_rank() idx_this = idx_unshuffle.reshape([num_gpus, -1])[gpu_idx] return paddle.gather(x_gather, idx_this, axis=0) def forward(self, x1, x2): # compute query features q = self.base_encoder(x1) # queries: NxC q = F.normalize(q, axis=1) # compute key features with paddle.no_grad(): # no gradient self._update_momentum_encoder() # update the momentum encoder # shuffle for making use of BN k, idx_unshuffle = self._batch_shuffle_ddp(x2) k = self.momentum_encoder(k) # keys: NxC k = F.normalize(k, axis=1) # undo shuffle k = self._batch_unshuffle_ddp(k, idx_unshuffle) # compute logits # Einstein sum is more intuitive # positive logits: Nx1 l_pos = paddle.sum(q * k, axis=1).unsqueeze(-1) # negative logits: NxK l_neg = paddle.matmul(q, self.queue.clone().detach()) # logits: Nx(1+K) logits = paddle.concat((l_pos, l_neg), axis=1) # apply temperature logits /= self.T # labels: positive key indicators labels = paddle.zeros([logits.shape[0]], dtype=paddle.int64) # dequeue and enqueue self._dequeue_and_enqueue(k) return (logits, labels) ================================================ FILE: ppfleetx/models/vision_model/moco_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
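# Training modules for MoCo:
# - MOCOModule wraps the MoCo pretraining graph (query/momentum encoders,
#   projector, classifier and the negative queue) and applies the configured
#   loss to the (logits, labels) pair returned by MoCo.forward.
# - MOCOClsModule performs linear-probe evaluation: it loads a pretrained
#   base encoder, freezes its parameters and batch-norm statistics, and
#   trains only the classification head.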
import os import sys import copy import datetime from collections import defaultdict import numpy as np import paddle import paddle.nn as nn from ppfleetx.utils.log import logger from ppfleetx.core.module.basic_module import BasicModule from .factory import build from .moco import MoCo class MOCOModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(MOCOModule, self).__init__(configs) assert 'train' in self.model_configs.loss self.loss_fn = build(self.model_configs.loss.train) self.train_batch_size = None self.best_metric = 0.0 def get_model(self): if not hasattr(self, 'model') or self.model is None: config = copy.deepcopy(self.model_configs.model) base_encoder = build(self.model_configs.model.base_encoder) base_projector = build( self.model_configs.model.get('base_projector', {"name": "Identity"})) base_classifier = build(self.model_configs.model.base_classifier) momentum_encoder = build(self.model_configs.model.momentum_encoder) momentum_projector = build( self.model_configs.model.get('momentum_projector', {"name": "Identity"})) momentum_classifier = build( self.model_configs.model.momentum_classifier) config['base_encoder'] = base_encoder config['base_projector'] = base_projector config['base_classifier'] = base_classifier config['momentum_encoder'] = momentum_encoder config['momentum_projector'] = momentum_projector config['momentum_classifier'] = momentum_classifier self.model = MoCo(**config) return self.model def forward(self, img_q, img_k): return self.model(img_q, img_k) def training_step(self, batch): img_q, img_k = batch # Note(GuoxiaWang)paddle.distributed.all_gather required CudaPlace img_q = img_q.cuda() img_k = img_k.cuda() if self.train_batch_size is None: self.train_batch_size = img_q.shape[ 0] * paddle.distributed.get_world_size() logits, labels = self(img_q, img_k) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): ips = self.train_batch_size / log_dict['train_cost'] total_step = log_dict['total_epoch'] * log_dict['total_batch'] cur_step = log_dict['epoch'] * log_dict['total_batch'] + log_dict[ 'batch'] + 1 remained_step = total_step - cur_step eta_sec = remained_step * log_dict['train_cost'] eta_msg = "eta: {:s}".format( str(datetime.timedelta(seconds=int(eta_sec)))) logger.info( "[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec, %s" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips, eta_msg)) def input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) class MOCOClsModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(MOCOClsModule, self).__init__(configs) assert 'train' in self.model_configs.loss self.loss_fn = build(self.model_configs.loss.train) self.eval_loss_fn = None if 'eval' in self.model_configs.loss: self.eval_loss_fn = build(self.model_configs.loss.eval) if 'train' in self.model_configs.metric: self.train_metric_fn = build(self.model_configs.metric.train) if 'eval' in 
self.model_configs.metric: self.eval_metric_fn = build(self.model_configs.metric.eval) self.train_batch_size = None self.eval_batch_size = None self.best_metric = 0.0 self.acc_list = [] def _freeze_backbone(self, layer): for param in layer.parameters(): param.trainable = False def freeze_norm(layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True layer.apply(freeze_norm) def get_model(self): if not hasattr(self, 'model') or self.model is None: pretrained_path = self.model_configs.model.base_encoder.pop( "pretrained") base_encoder = build(self.model_configs.model.base_encoder) self._freeze_backbone(base_encoder) pretrained_path = pretrained_path + ".pdparams" assert os.path.exists( pretrained_path), f'{pretrained_path} is not exists!' base_encoder_dict = paddle.load(pretrained_path) for k in list(base_encoder_dict.keys()): # retain only encoder_q up to before the embedding layer if k.startswith('base_encoder.0.'): # remove prefix base_encoder_dict[k[len( "base_encoder.0."):]] = base_encoder_dict[k] # delete renamed del base_encoder_dict[k] for name, param in base_encoder.state_dict().items(): if name in base_encoder_dict and param.dtype != base_encoder_dict[ name].dtype: base_encoder_dict[name] = base_encoder_dict[name].cast( param.dtype) base_encoder.set_state_dict(base_encoder_dict) logger.info(f'Load pretrained weight from {pretrained_path}') base_classifier = build(self.model_configs.model.base_classifier) self.model = nn.Sequential(base_encoder, base_classifier) return self.model def forward(self, inputs): return self.model(inputs) def training_step(self, batch): inputs, labels = batch if self.train_batch_size is None: self.train_batch_size = inputs.shape[ 0] * paddle.distributed.get_world_size() inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): ips = self.train_batch_size / log_dict['train_cost'] total_step = log_dict['total_epoch'] * log_dict['total_batch'] cur_step = log_dict['epoch'] * log_dict['total_batch'] + log_dict[ 'batch'] + 1 remained_step = total_step - cur_step eta_sec = remained_step * log_dict['train_cost'] eta_msg = "eta: {:s}".format( str(datetime.timedelta(seconds=int(eta_sec)))) logger.info( "[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec, %s" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips, eta_msg)) def validation_step(self, batch): inputs, labels = batch batch_size = inputs.shape[0] inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.eval_loss_fn(logits, labels) if paddle.distributed.get_world_size() > 1: label_list = [] paddle.distributed.all_gather(label_list, labels) labels = paddle.concat(label_list, 0) pred_list = [] paddle.distributed.all_gather(pred_list, logits) logits = paddle.concat(pred_list, 0) if self.eval_batch_size is None: self.eval_batch_size = logits.shape[0] acc = self.eval_metric_fn(logits, labels) self.acc_list.append(acc) return loss def validation_step_end(self, log_dict): ips = self.eval_batch_size / log_dict['eval_cost'] speed = self.configs['Engine']['logging_freq'] / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, step: [%d/%d], loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['loss'], log_dict['eval_cost'], ips)) def 
input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) def validation_epoch_end(self, log_dict): msg = '' if len(self.acc_list) > 0: ret = defaultdict(list) for item in self.acc_list: for key, val in item.items(): ret[key].append(val) for k, v in ret.items(): ret[k] = np.mean(v) if 'metric' in ret and ret['metric'] > self.best_metric: self.best_metric = ret['metric'] if 'metric' in ret: ret['best_metric'] = self.best_metric msg = ', ' msg += ", ".join([f'{k} = {v:.6f}' for k, v in ret.items()]) self.acc_list.clear() logger.info("[Eval] epoch: %d, total time: %.5f sec%s" % (log_dict['epoch'], log_dict['eval_cost'], msg)) ================================================ FILE: ppfleetx/models/vision_model/resnet/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.vision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152 __all__ = [ 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', ] ================================================ FILE: ppfleetx/models/vision_model/vit/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .vit import * ================================================ FILE: ppfleetx/models/vision_model/vit/vit.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
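# Vision Transformer (ViT) backbone. Besides the reference Block (LayerNorm ->
# ViTAttention -> DropPath residual, then LayerNorm -> ViTMLP -> DropPath
# residual), a FusedBlock variant built on paddle.incubate.nn's
# FusedMultiHeadAttention and FusedFeedForward can be selected with
# use_fused_attn=True; its state_dict is converted to and from the plain
# attention layout so checkpoints remain interchangeable.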
from collections.abc import Callable import os import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.incubate.nn import FusedMultiHeadAttention, FusedFeedForward from ppfleetx.utils.log import logger from ..layers.droppath import DropPath from ..layers.identity import Identity from ..layers.attention import ViTAttention from ..layers.embedding import ViTPatchEmbed from ..layers.mlp import ViTMLP from ..layers.initializer import (xavier_uniform_, xavier_uniform_2d_, mlp_bias_normal_, zeros_, minus_tens_, pos_normal_, ones_) __all__ = [ 'ViT_tiny_patch16_224', 'ViT_base_patch16_224', 'ViT_base_patch16_384', 'ViT_base_patch32_224', 'ViT_base_patch32_384', 'ViT_large_patch16_224', 'ViT_large_patch16_384', 'ViT_large_patch32_224', 'ViT_large_patch32_384', 'ViT_huge_patch14_224', 'ViT_huge_patch14_384', 'ViT_g_patch14_224', 'ViT_G_patch14_224', 'ViT_6B_patch14_224', 'ViT', ] class FusedBlock(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() assert qk_scale is None, "Fused attention doesn't support qk_scale." if isinstance(drop_path, (float, int)): assert drop_path == 0.0, "Fused attention doesn't support drop_path." elif isinstance(drop_path, (tuple, list)): assert drop_path == [0.0] * len( drop_path), "Fused attention doesn't support drop_path." assert norm_layer == "nn.LayerNorm", "Fused attention only support nn.LayerNorm" assert ((act_layer == nn.GELU) or (act_layer == nn.ReLU)) or \ (isinstance(act_layer, str) and act_layer.lower() == "gelu" or act_layer.lower() == "relu"), \ "Fused attention only support GELU and ReLU activation." self.attn = FusedMultiHeadAttention( dim, num_heads=num_heads, qkv_bias_attr=qkv_bias, dropout_rate=drop, attn_dropout_rate=attn_drop, normalize_before=True, epsilon=epsilon) mlp_hidden_dim = int(dim * mlp_ratio) if (act_layer == nn.GELU) or act_layer.lower() == "gelu": act_func = "gelu" else: act_func = "relu" self.mlp = FusedFeedForward( d_model=dim, dim_feedforward=mlp_hidden_dim, dropout_rate=drop, activation=act_func, act_dropout_rate=drop, normalize_before=True) xavier_uniform_2d_(self.attn.qkv_weight) xavier_uniform_2d_(self.attn.linear_weight) xavier_uniform_2d_(self.mlp._linear1_weight) xavier_uniform_2d_(self.mlp._linear2_weight) zeros_(self.attn.qkv_bias) zeros_(self.attn.linear_bias) mlp_bias_normal_(self.mlp._linear1_bias) mlp_bias_normal_(self.mlp._linear2_bias) def forward(self, x): return self.mlp(self.attn(x)) class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() if isinstance(norm_layer, str): self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm1 = norm_layer(dim) else: raise TypeError( "The norm_layer must be str or paddle.nn.layer.Layer class") self.attn = ViTAttention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() if isinstance(norm_layer, str): self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm2 = norm_layer(dim) else: raise TypeError( "The norm_layer must be str or paddle.nn.layer.Layer class") mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = ViTMLP( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) def forward(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class ViT(nn.Layer): """ Vision Transformer with support for patch input """ def __init__(self, img_size=224, patch_size=16, in_chans=3, class_num=1000, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer='nn.LayerNorm', epsilon=1e-5, representation_size=None, use_fused_attn=False, **kwargs): super().__init__() self.class_num = class_num self.representation_size = representation_size self.num_heads = num_heads self.num_features = self.embed_dim = embed_dim self.patch_embed = ViTPatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) num_patches = self.patch_embed.num_patches self.pos_embed = self.create_parameter( shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) self.cls_token = self.create_parameter( shape=(1, 1, embed_dim), default_initializer=zeros_) self.pos_drop = nn.Dropout(p=drop_rate) dpr = np.linspace(0, drop_path_rate, depth) self.use_fused_attn = use_fused_attn block_fn = FusedBlock if self.use_fused_attn else Block if self.use_fused_attn: logger.info( "ViT use fused attention. Fused attention model checkpoint will be" \ " saved in normal attention format for inference checkpoint export," \ " and its optimizer checkpoint keeps the same.") self.blocks = nn.LayerList([ block_fn( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, epsilon=epsilon) for i in range(depth) ]) self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) # Classifier head if self.representation_size is not None: self.head0 = nn.Linear(embed_dim, representation_size) self.tanh = nn.Tanh() self.head = nn.Linear(representation_size, class_num) if class_num > 0 else Identity() xavier_uniform_(self.head0.weight) zeros_(self.head0.bias) xavier_uniform_(self.head.weight) minus_tens_(self.head.bias) else: self.head = nn.Linear(embed_dim, class_num) if class_num > 0 else Identity() zeros_(self.head.weight) zeros_(self.head.bias) pos_normal_(self.pos_embed) zeros_(self.cls_token) self.apply(self._init_weights) pretrained_configs = kwargs.pop('pretrained', None) if pretrained_configs is not None: self.load_pretrained(**pretrained_configs) def _init_weights(self, m): if isinstance(m, nn.LayerNorm): zeros_(m.bias) ones_(m.weight) def forward_features(self, x): # B = x.shape[0] B = paddle.shape(x)[0] x = self.patch_embed(x) cls_tokens = self.cls_token.expand((B, -1, -1)) x = paddle.concat((cls_tokens, x), axis=1) x = x + self.pos_embed x = self.pos_drop(x) for blk in self.blocks: x = blk(x) x = self.norm(x) return x[:, 0] def forward(self, x): x = self.forward_features(x) if self.representation_size is not None: x = self.tanh(self.head0(x)) x = self.head(x) return x # Saved the fused attention checkpoint in origin attention checkpoint format replaced_dict = { # FusedMultiHeadAttention 'attn.pre_ln_scale': 'norm1.weight', 'attn.pre_ln_bias': 
'norm1.bias', 'attn.qkv_weight': 'attn.qkv.weight', 'attn.qkv_bias': 'attn.qkv.bias', 'attn.linear_weight': 'attn.proj.weight', 'attn.linear_bias': 'attn.proj.bias', # FusedFeedForward 'mlp._ln1_scale': 'norm2.weight', 'mlp._ln1_bias': 'norm2.bias', 'mlp._linear1_weight': 'mlp.fc1.weight', 'mlp._linear1_bias': 'mlp.fc1.bias', 'mlp._linear2_weight': 'mlp.fc2.weight', 'mlp._linear2_bias': 'mlp.fc2.bias', } @paddle.no_grad() def state_dict(self, destination=None, include_sublayers=True, structured_name_prefix="", use_hook=True): state_dict = super().state_dict(destination, include_sublayers, structured_name_prefix, use_hook) if self.use_fused_attn: new_dict = [] poped_keys = [] for key, value in state_dict.items(): new_key = "" for k, v in self.replaced_dict.items(): if k in key: new_key = key.replace(k, v) break if new_key != "": value_name = value.name if 'attn.qkv.weight' in new_key: value = value.reshape([-1, value.shape[-1]]).transpose( [1, 0]) if 'attn.qkv.bias' in new_key: value = value.reshape([-1]) # value is a Tensor after transformation, # it will be transformed to ParamBase for auto_infer param = paddle.create_parameter( shape=value.shape, dtype=value.dtype) param.set_value(value) param.name = value_name new_dict.append({new_key: param}) poped_keys.append(key) for i in range(len(new_dict)): state_dict.update(new_dict[i]) state_dict.pop(poped_keys[i]) return state_dict @paddle.no_grad() def set_state_dict(self, state_dict, use_structured_name=True): reversed_replaced_dict = {} for k, v in self.replaced_dict.items(): reversed_replaced_dict.update({v: k}) if self.use_fused_attn: new_dict = [] poped_keys = [] for key, value in state_dict.items(): new_key = "" for k, v in reversed_replaced_dict.items(): if k in key: new_key = key.replace(k, v) break if new_key != "": if 'attn.qkv_weight' in new_key: value = value.transpose([1, 0]) value = value.reshape( [3, self.num_heads, -1, value.shape[-1]]) if 'attn.qkv_bias' in new_key: value = value.reshape([3, self.num_heads, -1]) new_dict.append({new_key: value}) poped_keys.append(key) for i in range(len(new_dict)): state_dict.update(new_dict[i]) state_dict.pop(poped_keys[i]) super().set_state_dict(state_dict) def load_pretrained(self, prefix_path, finetune=False): if not os.path.exists(prefix_path + '.pdparams'): raise ValueError("Model pretrain path {} does not " "exists.".format(prefix_path)) state_dict = self.state_dict() param_state_dict = paddle.load(prefix_path + ".pdparams") # for FP16 saving pretrained weight for key, value in param_state_dict.items(): param_state_dict[key] = param_state_dict[key].astype( paddle.float32) if not finetune: self.set_state_dict(param_state_dict) return for k in ['head0.weight', 'head0.bias', 'head.weight', 'head.bias']: if k in param_state_dict: print(f"Removing key {k} from pretrained checkpoint") del param_state_dict[k] # interpolate position embedding pos_embed_checkpoint = param_state_dict['pos_embed'] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = self.patch_embed.num_patches num_extra_tokens = self.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** 0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] pos_tokens = paddle.transpose( 
pos_tokens.reshape([-1, orig_size, orig_size, embedding_size]), perm=[0, 3, 1, 2]) dtype = pos_tokens.dtype pos_tokens = paddle.nn.functional.interpolate( pos_tokens.astype(paddle.float32), size=(new_size, new_size), mode='bicubic', align_corners=False).astype(dtype) pos_tokens = paddle.transpose( pos_tokens, perm=[0, 2, 3, 1]).flatten(1, 2) new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) param_state_dict['pos_embed'] = new_pos_embed self.set_state_dict(param_state_dict) return def ViT_tiny_patch16_224(**kwargs): model = ViT(patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=192, **kwargs) return model def ViT_base_patch16_224(**kwargs): model = ViT(patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=768, **kwargs) return model def ViT_base_patch16_384(**kwargs): model = ViT(img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_base_patch32_224(**kwargs): model = ViT(patch_size=32, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=768, **kwargs) return model def ViT_base_patch32_384(**kwargs): model = ViT(img_size=384, patch_size=32, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_large_patch16_224(**kwargs): model = ViT(patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=1024, **kwargs) return model def ViT_large_patch16_384(**kwargs): model = ViT(img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_large_patch32_224(**kwargs): model = ViT(patch_size=32, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=1024, **kwargs) return model def ViT_large_patch32_384(**kwargs): model = ViT(img_size=384, patch_size=32, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_huge_patch14_224(**kwargs): model = ViT(patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=1280, **kwargs) return model def ViT_huge_patch14_384(**kwargs): model = ViT(img_size=384, patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_g_patch14_224(**kwargs): model = ViT(img_size=224, patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=4.364, qkv_bias=True, epsilon=1e-6, representation_size=1408, **kwargs) return model def ViT_G_patch14_224(**kwargs): model = ViT(img_size=224, patch_size=14, embed_dim=1664, depth=48, num_heads=16, mlp_ratio=4.9231, qkv_bias=True, epsilon=1e-6, representation_size=1664, **kwargs) return model def ViT_6B_patch14_224(**kwargs): model = ViT(img_size=224, patch_size=14, embed_dim=2320, depth=80, num_heads=16, mlp_ratio=4.955, qkv_bias=True, epsilon=1e-6, representation_size=2320, **kwargs) return model ================================================ FILE: ppfleetx/ops/setup_cuda.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.utils.cpp_extension import CUDAExtension, setup setup( name='ppfleetx_ops', ext_modules=CUDAExtension(sources=['topp_sampling.cu'])) ================================================ FILE: ppfleetx/ops/test_topp_sampling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import numpy as np from ppfleetx.ops import topp_sampling paddle.seed(2022) x = paddle.randn([1, 51200], dtype="float16") x = paddle.nn.functional.softmax(x) top_ps = paddle.to_tensor(np.random.uniform(0, 1, [1]).astype(np.float16)) out = topp_sampling(x, top_ps) print(out) ================================================ FILE: ppfleetx/ops/topp_sampling.cu ================================================ // Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include "cub/cub.cuh" #include "paddle/extension.h" #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") #define FINAL_MASK 0xFFFFFFFF #define FIXED_BLOCK_DIM_BASE(dim, ...) \ case (dim): { \ constexpr auto kBlockDim = (dim); \ __VA_ARGS__; \ } break #define FIXED_BLOCK_DIM(...) 
\ FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) template class PDTraits; template <> class PDTraits { public: typedef float DataType; typedef float data_t; }; template <> class PDTraits { public: typedef half DataType; typedef paddle::float16 data_t; }; struct SegmentOffsetIter { explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} __host__ __device__ __forceinline__ int operator()(int idx) const { return idx * num_cols_; } int num_cols_; }; template struct Pair { __device__ __forceinline__ Pair() {} __device__ __forceinline__ Pair(T value, int id) : v(value), id(id) {} __device__ __forceinline__ void set(T value, int id) { v = value; id = id; } __device__ __forceinline__ void operator=(const Pair& in) { v = in.v; id = in.id; } __device__ __forceinline__ bool operator<(const T value) const { return ((float)v < (float)value); } __device__ __forceinline__ bool operator>(const T value) const { return ((float)v > (float)value); } __device__ __forceinline__ bool operator<(const Pair& in) const { return ((float)v < (float)in.v) || (((float)v == (float)in.v) && (id > in.id)); } __device__ __forceinline__ bool operator>(const Pair& in) const { return ((float)v > (float)in.v) || (((float)v == (float)in.v) && (id < in.id)); } T v; int id; }; inline int div_up(int a, int n) { return (a + n - 1) / n; } __global__ void setup_kernel(curandState_t *state, const uint64_t seed, const int bs) { int idx = blockIdx.x * blockDim.x + threadIdx.x; for (int i = idx; i < bs; i += gridDim.x * blockDim.x) { curand_init(seed, 0, i, &state[i]); } } template __device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, int beam_size) { for (int k = beam_size - 2; k >= 0; k--) { if (topk[k] < p) { topk[k + 1] = topk[k]; } else { topk[k + 1] = p; return; } } topk[0] = p; } template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, int dim, int beam_size) { while (idx < dim) { if (topk[beam_size - 1] < src[idx]) { Pair tmp(src[idx], idx); AddTo(topk, tmp, beam_size); } idx += BlockSize; } } template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, int dim, const Pair& max, int beam_size) { while (idx < dim) { if (topk[beam_size - 1] < src[idx]) { Pair tmp(src[idx], idx); if (tmp < max) { AddTo(topk, tmp, beam_size); } } idx += BlockSize; } } template __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, bool* firstStep, bool* is_empty, Pair* max, int dim, const int tid) { if (*beam > 0) { int length = (*beam) < beam_size ? 
*beam : beam_size; if (*firstStep) { *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { topk[k].set(std::numeric_limits::min(), -1); } } if (!(*is_empty)) { GetTopK( topk + MaxLength - *beam, src, tid, dim, *max, length); } } *max = topk[MaxLength - 1]; if ((*max).id == -1) *is_empty = true; *beam = 0; } } template __forceinline__ __device__ Pair WarpReduce(Pair input) { #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { T tmp_val = __shfl_down_sync(FINAL_MASK, input.v, static_cast(offset), 32); int tmp_id = __shfl_down_sync(FINAL_MASK, input.id, static_cast(offset), 32); if ((float)input.v < (float)tmp_val) { input.v = tmp_val; input.id = tmp_id; } } return input; } template __device__ __forceinline__ void BlockReduce(Pair shared_max[], Pair topk[], Pair beam_max[], int* beam, int* k, int *count, const int tid, const int wid, const int lane) { while (true) { __syncthreads(); Pair input_now = topk[0]; input_now = WarpReduce(input_now); if (lane == 0) { shared_max[wid] = input_now; } __syncthreads(); input_now = (tid < BlockSize / 32) ? shared_max[lane] : Pair(std::numeric_limits::min(), -1); if (wid == 0) { input_now = WarpReduce(input_now); if (lane == 0) shared_max[0] = input_now; } __syncthreads(); if (tid == 0) { beam_max[*count] = shared_max[0]; (*count)++; } int tid_max = shared_max[0].id % BlockSize; if (tid == tid_max) { (*beam)++; } if (--(*k) == 0) break; __syncthreads(); if (tid == tid_max) { if (*beam < MaxLength) { topk[0] = topk[*beam]; } } if (MaxLength < 5) { if (*beam >= MaxLength) break; } else { unsigned mask = 0u; mask = __ballot_sync(FINAL_MASK, true); if (tid_max / 32 == wid) { if (__shfl_down_sync(FINAL_MASK, *beam, tid_max % 32, 32) == MaxLength) break; } } } } template __global__ void KeMatrixTopPBeamTopK(const T* src, T *top_ps, int64_t *out_id, // topk id T *out_val, // topk val int vocab_size, curandState_t *state, int *count_iter, int *count_iter_begin) { const int tid = threadIdx.x; const int wid = tid / 32; const int lane = tid % 32; const int bid = blockIdx.x; int top_num = TopPBeamTopK; float top_p_num = (float)top_ps[bid]; __shared__ Pair shared_max[BlockSize / 32]; __shared__ Pair beam_max[TopPBeamTopK]; Pair topk[MaxLength]; int beam = MaxLength; Pair max; bool is_empty = false; bool firststep = true; __shared__ int count; if (tid == 0) { count = 0; } for (int j = 0; j < MaxLength; j++) { topk[j].set(std::numeric_limits::min(), -1); } while (top_num) { ThreadGetTopK(topk, &beam, TopPBeamTopK, src + bid * vocab_size, &firststep, &is_empty, &max, vocab_size, tid); BlockReduce(shared_max, topk, beam_max, &beam, &top_num, &count, tid, wid, lane); } if (tid == 0) { count_iter_begin[bid] = count_iter[bid]; float rand_top_p = curand_uniform(state + bid) * top_p_num; top_ps[bid] = (T)rand_top_p; float sum_prob = 0.0f; #pragma unroll for(int i = 0; i < TopPBeamTopK; i++) { sum_prob += (float)(beam_max[i].v); if(sum_prob >= rand_top_p) { count_iter_begin[bid] += 1; out_id[bid] = (int64_t)beam_max[i].id; out_val[bid] = beam_max[i].v; break; } } } } __global__ void SetCountIter(int *count_iter, int num) { int tid = threadIdx.x; int bid = blockIdx.x; int idx = bid * blockDim.x + tid; for (int i = idx; i < num; i += gridDim.x * blockDim.x) { count_iter[i] = i; } } template __global__ void FillIndex(T* indices, T num_rows, T num_cols) { int col_id = threadIdx.x; int row_id = blockIdx.x; for (T j = row_id; j < num_rows; j += gridDim.x) { 
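// Descriptive note: this is a grid-stride fill. Each block walks the rows
// (j advances by gridDim.x) and, within a row, each thread writes the column
// positions it owns (i advances by blockDim.x), so indices[j][i] = i for every
// vocabulary slot of every batch row before the segmented sort reorders them.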
for (T i = col_id; i < num_cols; i += blockDim.x) { indices[j * num_cols + i] = i; } } } struct BlockPrefixCallbackOp { // Running prefix float running_total; // Constructor __device__ BlockPrefixCallbackOp(float running_total): running_total(running_total) {} // Callback operator to be entered by the first warp of threads in the block. // Thread-0 is responsible for returning a value for seeding the block-wide scan. __device__ float operator()(float block_aggregate) { float old_prefix = running_total; running_total += block_aggregate; return old_prefix; } }; template __global__ void topp_sampling(T *sorted_probs, int64_t *sorted_id, T *out_val, int64_t *out_id, const T *top_ps, int p_num, int vocab_size, int *count_iter, int *count_iter_begin) { __shared__ int stop_shared; __shared__ float rand_p; const int tid = threadIdx.x; const int bid = blockIdx.x; constexpr int WARP_SIZE = 32; constexpr int NUM_WARPS = BLOCK_SIZE / WARP_SIZE; const int lane_id = tid % WARP_SIZE; const int warp_id = tid / WARP_SIZE; const float p_t = (float)top_ps[bid]; if (tid == 0) { stop_shared = 0; rand_p = p_t; } if (count_iter_begin[bid] == count_iter[bid + 1]) { // topk return; } typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; __shared__ uint32_t selected_shared[NUM_WARPS]; // Initialize running total BlockPrefixCallbackOp prefix_op(0); if (lane_id == 0) { selected_shared[warp_id] = 0; } __syncthreads(); int offset = bid * vocab_size; int end = ((vocab_size + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; int i_activate = 0; float thread_offset = 0; for (int i = tid; i < end; i += BLOCK_SIZE) { float thread_count = (i < vocab_size) ? (float)sorted_probs[offset + i] : 0.f; BlockScan(temp_storage).InclusiveSum(thread_count, thread_offset, prefix_op); uint32_t activate_mask = __ballot_sync(FINAL_MASK, rand_p <= thread_offset); i_activate = i; if (activate_mask != 0) { if (lane_id == 0) { atomicAdd(&stop_shared, 1); selected_shared[warp_id] = activate_mask; } } __syncthreads(); if(stop_shared > 0) { break; } } bool skip = (selected_shared[warp_id] > 0) ? 
false : true; for (int i=0; i < warp_id; i++) { if(selected_shared[i] != 0) { skip = true; } } if (!skip) { int active_lane_id = WARP_SIZE - __popc(selected_shared[warp_id]); // first not 0 if (lane_id == active_lane_id) { // printf("active_lane_id: %d, i_activate: %d.\n", active_lane_id, i_activate); // for (int i=0; i < active_lane_id; i++) { // printf("p %d, value: %f\n", i, (float)(sorted_probs[offset + i])); // } out_id[bid] = sorted_id[offset + i_activate]; out_val[bid] = sorted_probs[offset + i_activate]; } } } int GetBlockSize(int vocab_size) { if (vocab_size > 512) { return 1024; } else if (vocab_size > 256) { return 512; } else if (vocab_size > 128) { return 256; } else if (vocab_size > 64) { return 128; } else { return 64; } } template __global__ void print_kernel(T *input, int size) { printf("["); for (int i=0; i < size; i++) { if (i != size-1) { printf("%f, ", (float)input[i]); } else { printf("%f]\n", (float)input[i]); } } } template std::vector top_p_sampling_kernel(const paddle::Tensor& x, const paddle::Tensor& top_ps, int random_seed) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; typedef typename traits_::data_t data_t; std::vector shape = x.shape(); auto cu_stream = x.stream(); int bs = shape[0]; int p_num = top_ps.numel(); PD_CHECK(bs == p_num, "PD_CHECK returns ", false, ", expected bs == p_num."); int vocab_size = shape[1]; auto topp_ids = paddle::full({bs, 1}, 1, paddle::DataType::INT64, x.place()); auto topp_probs = paddle::full({bs, 1}, 1, x.dtype(), x.place()); auto inds_input = paddle::full({bs, vocab_size}, 1, paddle::DataType::INT64, x.place()); auto sorted_out = paddle::full({bs, vocab_size}, 1, x.dtype(), x.place()); auto sorted_id = paddle::full({bs, vocab_size}, 1, paddle::DataType::INT64, x.place()); int BlockSize = GetBlockSize(vocab_size); switch (BlockSize) { FIXED_BLOCK_DIM(FillIndex<<>>(inds_input.data(), bs, vocab_size)); default: PD_THROW("the input data shape has error in the FillIndex kernel."); } static int count = 0; static curandState_t* dev_curand_states; if (count == 0) { #if CUDA_VERSION >= 11020 cudaMallocAsync(&dev_curand_states, bs * sizeof(curandState_t), cu_stream); #else cudaMalloc(&dev_curand_states, bs * sizeof(curandState_t)); #endif } srand((unsigned int)(time(NULL))); setup_kernel<<<1, 256, 0, cu_stream>>>(dev_curand_states, rand() % random_seed, bs); PD_CHECK(bs == p_num, "PD_CHECK returns ", false, ", expected bs == p_num."); auto count_iter = paddle::empty({bs + 1}, paddle::DataType::INT32, x.place()); auto count_iter_begin = paddle::empty({bs}, paddle::DataType::INT32, x.place()); SetCountIter<<<1, 256, 0, cu_stream>>>(count_iter.data(), bs + 1); constexpr int TopKMaxLength = 1; constexpr int TopPBeamTopK = 1; switch (BlockSize) { FIXED_BLOCK_DIM( KeMatrixTopPBeamTopK<<>>( reinterpret_cast(const_cast(x.data())), reinterpret_cast(const_cast(top_ps.data())), topp_ids.data(), reinterpret_cast(topp_probs.data()), vocab_size, dev_curand_states, count_iter.data(), count_iter_begin.data())); default: PD_THROW("the input data shape has error in the topp_beam_topk kernel."); } // if (count % random_seed == random_seed - 1) { // #if CUDA_VERSION >= 11020 // cudaFreeAsync(dev_curand_states, cu_stream); // #else // cudaFree(dev_curand_states); // #endif // } count++; size_t temp_storage_bytes = 0; cub::TransformInputIterator segment_offsets_t_begin(count_iter_begin.data(), SegmentOffsetIter(vocab_size)); cub::TransformInputIterator segment_offsets_t_end(count_iter.data(), SegmentOffsetIter(vocab_size)); 
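// Descriptive note: the calls below follow CUB's usual two-pass pattern. The
// first SortPairsDescending call passes a null workspace pointer purely to
// query temp_storage_bytes; that size is rounded up to a multiple of 256, a
// UINT8 buffer of that size is allocated, and the second call performs the
// real per-batch (segmented) descending sort of probabilities and their ids.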
DataType_ *x_ptr = reinterpret_cast(const_cast(x.data())); DataType_ *sorted_out_ptr = reinterpret_cast(const_cast(sorted_out.data())); int64_t *in_id_ptr = inds_input.data(); int64_t *out_id_ptr = sorted_id.data(); cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, x_ptr, sorted_out_ptr, in_id_ptr, out_id_ptr, vocab_size * bs, bs, segment_offsets_t_begin, segment_offsets_t_end + 1, 0, sizeof(data_t) * 8, cu_stream); temp_storage_bytes = div_up(temp_storage_bytes, 256) * 256; int64_t temp_size = temp_storage_bytes; auto temp_storage = paddle::empty({temp_size}, paddle::DataType::UINT8, x.place()); cub::DeviceSegmentedRadixSort::SortPairsDescending( temp_storage.data(), temp_storage_bytes, x_ptr, sorted_out_ptr, in_id_ptr, out_id_ptr, vocab_size * bs, bs, segment_offsets_t_begin, segment_offsets_t_end + 1, 0, sizeof(data_t) * 8, cu_stream); switch (BlockSize) { FIXED_BLOCK_DIM( topp_sampling<<>>( sorted_out_ptr, out_id_ptr, reinterpret_cast(topp_probs.data()), topp_ids.data(), reinterpret_cast(const_cast(top_ps.data())), p_num, vocab_size, count_iter.data(), count_iter_begin.data())); default: PD_THROW("the input data shape has error in the topp_sampling kernel."); } return {topp_probs, topp_ids}; } std::vector TopPSampling(const paddle::Tensor& x, const paddle::Tensor& top_ps, int random_seed) { switch (x.type()) { case paddle::DataType::FLOAT16: { return top_p_sampling_kernel( x, top_ps, random_seed ); } case paddle::DataType::FLOAT32: { return top_p_sampling_kernel( x, top_ps, random_seed ); } default: { PD_THROW( "NOT supported data type. " "Only float16 and float32 are supported. "); break; } } } std::vector> TopPSamplingInferShape(const std::vector& x_shape, const std::vector& top_ps_shape) { std::vector out_probs_shape = {x_shape[0], 1}; std::vector out_ids_shape = {x_shape[0], 1}; return {out_probs_shape, out_ids_shape}; } std::vector TopPSamplingInferDtype(const paddle::DataType& x_dtype, const paddle::DataType& top_ps_dtype) { return {x_dtype, paddle::DataType::INT64}; } PD_BUILD_OP(topp_sampling) .Inputs({"x", "top_ps"}) .Outputs({"topp_probs", "topp_ids"}) .Attrs({"random_seed: int"}) .SetKernelFn(PD_KERNEL(TopPSampling)) .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingInferDtype)); ================================================ FILE: ppfleetx/optims/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
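# Illustrative single-row NumPy sketch (for readability only, not part of any
# ppfleetx module) of the sampling rule the topp_sampling kernels above appear
# to implement: probabilities are sorted in descending order, a threshold
# r ~ U(0, top_p) is drawn, and the first token whose cumulative probability
# reaches r is returned together with its probability.
import numpy as np


def topp_sampling_reference(probs, top_p, rng=None):
    rng = rng or np.random.default_rng()
    order = np.argsort(-probs)                    # descending sort of one row
    sorted_probs = probs[order]
    r = rng.uniform(0.0, float(top_p))            # threshold inside the nucleus
    cum = np.cumsum(sorted_probs)
    idx = min(int(np.searchsorted(cum, r)), len(cum) - 1)
    return float(sorted_probs[idx]), int(order[idx])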
from collections import defaultdict import sys import copy import paddle from paddle.optimizer.lr import LRScheduler from .lr_scheduler import * from .optimizer import * from .grad_clip import * from ppfleetx.utils.log import logger def build_lr_scheduler(lr_config): if 'name' in lr_config: lr_name = lr_config.pop('name') lr = eval(lr_name)(**lr_config) if isinstance(lr, LRScheduler): return lr else: return lr() else: lr = lr_config.learning_rate logger.debug("build lr ({}) success..".format(lr)) return lr def build_grad_clip(grad_clip_config): if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') clip_norm = grad_clip_config.get('clip_norm', 1.0) grad_clip = eval(grad_clip_name)( **grad_clip_config) if clip_norm != 0. else None return grad_clip else: return None def build_optimizer(config, model, lr_scheduler=None): config = copy.deepcopy(config) if lr_scheduler is not None: config.pop('lr') multi_precision = config.get('multi_precision', False) if multi_precision: paddle.nn.clip._clip_by_global_norm_using_mp_type(True) grad_clip_config = config.pop('grad_clip', None) grad_clip = build_grad_clip(grad_clip_config) optim_name = config.pop('name') optim = eval(optim_name)(learning_rate=lr_scheduler, parameters=model.parameters(), grad_clip=grad_clip, **config) logger.debug("build optimizer ({}) success..".format(optim)) return optim ================================================ FILE: ppfleetx/optims/grad_clip.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
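# Illustrative sketch of how the builders above fit together (the config
# values here are hypothetical, not taken from any shipped YAML); it is a
# reading aid and is not invoked anywhere in this package.
def _example_build_optimizer():
    import paddle
    from ppfleetx.optims import build_lr_scheduler, build_optimizer

    opt_cfg = {
        'name': 'AdamW',
        'weight_decay': 0.01,
        'lr': {'name': 'CosineAnnealingWithWarmupDecay', 'max_lr': 1e-4,
               'min_lr': 1e-5, 'warmup_rate': 0.01, 'decay_steps': 10000},
        'grad_clip': {'name': 'ClipGradByGlobalNorm', 'clip_norm': 1.0},
    }
    model = paddle.nn.Linear(8, 8)                 # stand-in model
    scheduler = build_lr_scheduler(opt_cfg['lr'])  # consumes the 'lr' block
    return build_optimizer(opt_cfg, model, scheduler)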
import paddle from paddle.nn.clip import ClipGradByGlobalNorm from paddle.nn.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base from paddle.fluid import core, layers from paddle.distributed import collective import paddle.distributed.fleet as fleet from ppfleetx.distributed.apis import env class ClipGradForMOEByGlobalNorm(ClipGradBase): def __init__(self, clip_norm): super(ClipGradForMOEByGlobalNorm, self).__init__() self.clip_norm = float(clip_norm) self.moe_group = None self.world_size = paddle.distributed.get_world_size() if self.world_size > 1: hcg = env.get_hcg() self.moe_group = hcg.get_expert_parallel_group() def __str__(self): return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) @staticmethod def get_l2_norm_pow(params_grads, sum_dtype=None): sum_square_list = [] sum_square_list_fp16 = [] sum_square_list_fp32 = [] for p, g in params_grads: if g is None: continue if getattr(p, 'need_clip', True) is False: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) if sum_square.dtype == core.VarDesc.VarType.FP16: sum_square_list_fp16.append(sum_square) elif sum_square.dtype == core.VarDesc.VarType.FP32: sum_square_list_fp32.append(sum_square) else: sum_square_list.append(sum_square) # all parameters have been filterd out if len(sum_square_list) + len(sum_square_list_fp16) + len( sum_square_list_fp32) == 0: return None, None assert sum_dtype in ["float64", "float32", None], \ "sum's type must be float64/ float32 / None" if sum_dtype != "float64": sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" global_norm_var = [] if len(sum_square_list_fp16) > 0: global_norm_var_fp16 = layers.concat(sum_square_list_fp16) global_norm_var_fp16 = layers.reduce_sum(global_norm_var_fp16) global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) if len(sum_square_list_fp32) > 0: global_norm_var_fp32 = layers.concat(sum_square_list_fp32) global_norm_var_fp32 = layers.reduce_sum(global_norm_var_fp32) if sum_dtype == 'float32': global_norm_var.append(global_norm_var_fp32) else: global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) if len(sum_square_list) > 0: global_norm_var_fp64 = layers.concat(sum_square_list) global_norm_var_fp64 = layers.reduce_sum(global_norm_var_fp64) global_norm_var.append(global_norm_var_fp64) global_norm_var = layers.concat(global_norm_var) global_norm_var = layers.reduce_sum(global_norm_var) return global_norm_var, sum_dtype @imperative_base.no_grad def _dygraph_clip(self, params_grads): normal_params_grads = [] moe_params_grads = [] # separate moe params from normal params if self.moe_group is not None and self.moe_group.nranks > 1: for p, g in params_grads: if "expert" in p.name or "gate" in p.name: moe_params_grads.append((p, g)) else: normal_params_grads.append((p, g)) else: normal_params_grads = params_grads # why to return sum_dtype? # we will call `get_l2_norm_pow` twice and the precisions may be different. 
# For convenience and simplification, we use sum_dtype directly instead of global_norm_var_normal.dtype global_norm_var_normal, sum_dtype \ = self.get_l2_norm_pow(normal_params_grads) global_norm_var_moe = None if len(moe_params_grads) > 0: global_norm_var_moe, _ \ = self.get_l2_norm_pow(moe_params_grads, sum_dtype) if global_norm_var_moe is not None: collective.all_reduce( global_norm_var_moe, op=collective.ReduceOp.SUM, group=self.moe_group) if global_norm_var_normal is None and global_norm_var_moe is None: return params_grads elif global_norm_var_normal is None: global_norm_var = global_norm_var_moe elif global_norm_var_moe is None: global_norm_var = global_norm_var_normal else: if global_norm_var_normal.dtype != global_norm_var_moe.dtype: # compared with normal norm, moe norm is the later one, # so its precision is no lower than normal norm global_norm_var_normal = \ global_norm_var_normal.astype(global_norm_var_moe.dtype) global_norm_var = global_norm_var_normal + global_norm_var_moe global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, y=layers.elementwise_max( x=global_norm_var, y=max_global_norm)) clip_var_fp16 = paddle.cast(clip_var, paddle.float16) for p, g in params_grads: if g is None or getattr(p, 'need_clip', True) is False: continue if p.dtype == paddle.float16: g.scale_(clip_var_fp16) else: g.scale_(clip_var) p._reset_grad_inplace_version(True) return params_grads ================================================ FILE: ppfleetx/optims/lr_scheduler.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
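# Illustrative single-process NumPy sketch (not part of this module) of the
# clipping rule ClipGradForMOEByGlobalNorm applies above: squared L2 norms of
# expert ("moe") gradients are summed and all-reduced inside the expert group,
# added to the squared norms of the remaining gradients, and every gradient is
# then scaled by clip_norm / max(global_norm, clip_norm).
import numpy as np


def moe_global_norm_clip_reference(normal_grads, moe_grads_per_rank, clip_norm=1.0):
    sq_normal = sum(float(np.sum(g * g)) for g in normal_grads)
    # stand-in for collective.all_reduce over the expert-parallel group
    sq_moe = sum(float(np.sum(g * g)) for rank in moe_grads_per_rank for g in rank)
    global_norm = float(np.sqrt(sq_normal + sq_moe))
    scale = clip_norm / max(global_norm, clip_norm)
    # the same scale is applied to the moe gradients on their owning ranks
    return scale, [g * scale for g in normal_grads]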
import math import numpy import warnings from paddle import Tensor from paddle.optimizer import lr from paddle.optimizer.lr import LRScheduler __all__ = [ 'CosineAnnealingWithWarmupDecay', 'LinearDecayWithWarmup', 'ViTLRScheduler', 'MultiStepDecay', 'CosineDecay', ] class CosineAnnealingWithWarmupDecay(LRScheduler): def __init__(self, max_lr, min_lr, warmup_rate, decay_steps, last_epoch=0, verbose=False, **kwargs): self.decay_steps = decay_steps self.warmup_step = warmup_rate * decay_steps self.max_lr = max_lr self.min_lr = min_lr super(CosineAnnealingWithWarmupDecay, self).__init__( max_lr, last_epoch, verbose) def get_lr(self): if self.warmup_step > 0 and self.last_epoch <= self.warmup_step: return float(self.max_lr) * (self.last_epoch) / self.warmup_step if self.last_epoch > self.decay_steps: return self.min_lr num_step_ = self.last_epoch - self.warmup_step decay_steps_ = self.decay_steps - self.warmup_step decay_ratio = float(num_step_) / float(decay_steps_) coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) return self.min_lr + coeff * (self.max_lr - self.min_lr) def step(self, epoch=None): if epoch is None: self.last_epoch += 0 self.last_lr = self.get_lr() else: self.last_epoch += epoch if hasattr(self, "_get_closed_form_lr"): self.last_lr = self._get_closed_form_lr() else: self.last_lr = self.get_lr() if self.verbose: print('Epoch {}: {} set learning rate to {}.'.format( self.last_epoch, self.__class__.__name__, self.last_lr)) class LinearDecayWithWarmup(LRScheduler): def __init__(self, learning_rate, step_each_epoch, epochs, warmup=0, verbose=False, last_epoch=-1, **kwargs): if kwargs.get('total_steps', -1) > 0: self.T_max = total_steps else: self.T_max = epochs * step_each_epoch self.warmup_steps = warmup if isinstance( warmup, int) else int(math.floor(warmup * self.T_max)) super(LinearDecayWithWarmup, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): if self.last_epoch < self.warmup_steps: return self.base_lr * (float(self.last_epoch) / float(max(1, self.warmup_steps))) return self.base_lr * max(0.0, 1.0 - self.last_epoch / self.T_max) class ViTLRScheduler(LRScheduler): def __init__(self, learning_rate, step_each_epoch, epochs, decay_type='cosine', linear_end=1e-5, warmup_steps=0, verbose=False, last_epoch=-1, **kwargs): self.linear_end = linear_end self.T_max = epochs * step_each_epoch self.warmup_steps = warmup_steps if self.warmup_steps >= self.T_max: self.warmup_steps = self.T_max - 1 self.decay_type = decay_type self.last_epoch = last_epoch super(ViTLRScheduler, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): progress = (self.last_epoch - self.warmup_steps ) / float(self.T_max - self.warmup_steps) progress = min(1.0, max(0.0, progress)) if self.decay_type == 'linear': lr = self.linear_end + (self.base_lr - self.linear_end) * ( 1.0 - progress) elif self.decay_type == 'cosine': lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress)) if self.warmup_steps: lr = lr * min(1.0, self.last_epoch / self.warmup_steps) return lr class MultiStepDecay(lr.MultiStepDecay): def __init__(self, learning_rate, step_each_epoch, epochs, milestones, gamma=0.1, last_epoch=-1, verbose=False, **kwargs): super(MultiStepDecay, self).__init__( learning_rate=learning_rate, milestones=milestones, gamma=gamma, last_epoch=last_epoch, verbose=verbose) class CosineDecay(lr.LRScheduler): def __init__(self, learning_rate, step_each_epoch, epochs, update_unit='epoch', warmups=0, verbose=False, last_epoch=-1, **kwargs): self.T_max = epochs if update_unit == 
'epoch' else step_each_epoch * epochs self.warmups = warmups if update_unit == 'epoch' else step_each_epoch * warmups assert self.warmups < self.T_max self.last_epoch = last_epoch super(CosineDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): progress = ( self.last_epoch - self.warmups) / float(self.T_max - self.warmups) progress = min(1.0, max(0.0, progress)) if self.warmups: lr = lr * min(1.0, self.last_epoch / self.warmups) else: lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress)) return lr ================================================ FILE: ppfleetx/optims/optimizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import paddle import paddle.distributed.fleet as fleet from ppfleetx.utils.tensor_fusion_helper import fused_parameters from paddle.optimizer import Adam, AdamW, Momentum from ppfleetx.distributed.apis import env __all__ = [ 'Adam', 'AdamW', 'Momentum', 'FusedAdamW', ] class FusedAdamW(paddle.optimizer.AdamW): def __init__(self, learning_rate, parameters, grad_clip, **config): tensor_fusion = config.pop("tensor_fusion", False) if paddle.distributed.get_world_size() > 1: hcg = env.get_hcg() sharding_size = hcg.get_sharding_parallel_world_size() if tensor_fusion: self.decay_fused_tensors, self.all_fused_tensors = fused_parameters( parameters, sharding_size > 1) decay_params = [p.name for p in self.decay_fused_tensors] else: decay_params = [ p.name for p in parameters if not any(nd in p.name for nd in ["bias", "norm", "b_0"]) ] apply_decay_param_fun = lambda x: x in decay_params super().__init__( learning_rate=learning_rate, parameters=self.all_fused_tensors if tensor_fusion else parameters, grad_clip=grad_clip, apply_decay_param_fun=apply_decay_param_fun, **config) ================================================ FILE: ppfleetx/tools/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/tools/multiprocess_tool.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import multiprocessing from multiprocessing import Process import math import time import os import argparse import warnings """ Multi-process batch processing tool This tool provides a multi-process batch processing method. For example, multi-process batch download data, multi-process preprocessing data, etc. The tool relies on executable shell commands or scripts. Its essence is to use Python's multi-process library to create multiple processes, and call executable commands or scripts through the os.system API. Executable commands or scripts are passed in via a txt text file, organized by line. For example, the following example is download, unzip and delete example. batch_cmd.txt wget http://xxxx.com/0.tar && tar -xf 0.tar && rm 0.tar wget http://xxxx.com/1.tar && tar -xf 1.tar && rm 1.tar ... wget http://xxxx.com/99.tar && tar -xf 99.tar && rm 99.tar How to run: python multiprocess_tool.py --num_proc 10 --shell_cmd_list_filename batch_cmd.txt """ def process_fn(cmd_list): for cmd in cmd_list: try: ret = os.system(cmd) if ret != 0: raise Exception(f'execute command: {cmd} failed.') except Exception as e: print(e) def read_command(shell_cmd_list_filename): shell_cmd_list = [] with open(shell_cmd_list_filename, 'r') as f: for cmd in f: cmd = cmd.strip() shell_cmd_list.append(cmd) return shell_cmd_list def parallel_process(cmd_list, nproc=20): if nproc > multiprocessing.cpu_count(): warnings.warn( 'The set number of processes exceeds the number of cpu cores, please confirm whether it is reasonable.' ) num_cmd = len(cmd_list) num_cmd_part = (num_cmd + nproc - 1) // nproc workers = [] for i in range(min(nproc, num_cmd)): start = i * num_cmd_part end = min(start + num_cmd_part, num_cmd) p = Process(target=process_fn, args=(cmd_list[start:end], )) workers.append(p) p.start() for p in workers: p.join() def main(args): start = time.time() shell_cmd_list = read_command(args.shell_cmd_list_filename) parallel_process(shell_cmd_list, args.num_proc) end = time.time() print("Cost time: {:.2f}".format(end - start)) if __name__ == "__main__": parse = argparse.ArgumentParser( description='multi-process batch processing tool') parse.add_argument('--num_proc', type=int, default=20) parse.add_argument( '--shell_cmd_list_filename', type=str, help='a txt file contains shell command list to be execute.') args = parse.parse_args() main(args) ================================================ FILE: ppfleetx/utils/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
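# Illustrative sketch (hypothetical URLs, mirroring the batch_cmd.txt example
# in multiprocess_tool.py above) of driving the tool programmatically instead
# of through its CLI; it is not invoked anywhere in this repository.
def _example_parallel_download():
    from ppfleetx.tools.multiprocess_tool import parallel_process

    cmds = [
        "wget http://xxxx.com/{i}.tar && tar -xf {i}.tar && rm {i}.tar".format(i=i)
        for i in range(4)
    ]
    parallel_process(cmds, nproc=2)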
================================================ FILE: ppfleetx/utils/check.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import paddle from paddle import is_compiled_with_cuda from .log import logger from .device import get_device_and_mapping def check_version(): """ Log error and exit when the installed version of paddlepaddle is not satisfied. """ err = "PaddlePaddle version 1.8.0 or higher is required, " \ "or a suitable develop version is satisfied as well. \n" \ "Please make sure the version is good with your code." try: pass # paddle.utils.require_version('0.0.0') except Exception: logger.error(err) sys.exit(1) def check_device(device): """ Log error and exit when using paddlepaddle cpu version. """ err = "You are using paddlepaddle %s version! Please try to \n" \ "1. install paddlepaddle-%s to run model on %s \nor 2. set the config option 'Global.device' to %s." d, supported_device_map = get_device_and_mapping() assert device in supported_device_map, \ f"the device({device}) to check is not supported by now.Now the paddle only supports: {supported_device_map.keys()}" err = err % (d, device, device, d) try: assert supported_device_map[device] except AssertionError: logger.error(err) sys.exit(1) ================================================ FILE: ppfleetx/utils/compression_helper.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddleslim def get_pruned_params(model): params = [] for sublayer in model.sublayers(): for param in sublayer.parameters(include_sublayers=False): if isinstance(sublayer, paddle.nn.layer.common.Linear) or isinstance( sublayer, paddle.distributed.fleet.layers.mpu. mp_layers.ColumnParallelLinear) or isinstance( sublayer, paddle.distributed.fleet.layers. mpu.mp_layers.RowParallelLinear): if len(param.shape) != 2: continue # NOTE(minghaoBD): # 1. param.shape[1] == 3 * param.shape[0]: prune fused-qkv's weight and its next weight: out-linear's weight # 2. 
param.shape[1] == 4 * param.shape[0]: prune ffn1's weight and its next weight: ffn2's weight # If your model has a different architecture, like your qkv's weights are not fused or ffn1_weight.shape[1] != 4*ffn1_weight.shape[0], you may need to customize this function to suit your model. if param.shape[1] == 3 * param.shape[0] or param.shape[ 1] == 4 * param.shape[0]: params.append(param.name) return params def prune_model(model, configs, inputs_desc=[]): prune_criterion = configs.criterion ratio = configs.ratio shapes, dtypes = [], [] for input_desc in inputs_desc: dtypes.append(input_desc.dtype) new_shape = [10 if item == -1 else item for item in input_desc.shape] shapes.append(new_shape) #TODO(minghaoBD): support ViT and other model architectures in the future num_attention_heads = model.gpt.decoder.layers[0].self_attn.num_heads if prune_criterion == 'l1_norm': pruner = paddleslim.L1NormFilterPruner( model, shapes, skip_leaves=False, prune_type='fc', input_dtype=dtypes[0], num_head=num_attention_heads) elif prune_criterion == 'l2_norm': pruner = paddleslim.L2NormFilterPruner( model, shapes, skip_leaves=False, prune_type='fc', input_dtype=dtypes[0], num_head=num_attention_heads) params = get_pruned_params(model) ratios = {} for param in params: ratios[param] = ratio #NOTE(minghaoBD): hidden size in Layernorm must be 768/1024/2048/4096 for best inference performace, and when axis=0, the hidden size in layernorm will be changed accordingly. So axis=1 is required. plan = pruner.prune_vars(ratios, [1]) def quant_model(model, configs): quanter = paddleslim.dygraph.quant.QAT(configs) return quanter.quantize(model), quanter ================================================ FILE: ppfleetx/utils/config.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import copy import argparse import yaml import codecs import sys import logging from .log import logger, advertise from . 
import check import paddle import paddle.distributed as dist import paddle.distributed.auto_parallel as auto from paddle.fluid.reader import use_pinned_memory __all__ = ['get_config', 'print_config'] def process_dist_config(configs): """ process distributed strategy for hybrid parallel """ nranks = dist.get_world_size() config = configs['Distributed'] config.setdefault("hcg", "HybridCommunicateGroup") mp_degree = config.setdefault("mp_degree", 1) pp_degree = config.setdefault("pp_degree", 1) pp_recompute_interval = config.setdefault("pp_recompute_interval", 1) # sharding default sharding_config = config['sharding'] sharding_degree = sharding_config.setdefault("sharding_degree", 1) sharding_stage = sharding_config.setdefault('sharding_stage', 2) sharding_offload = sharding_config.setdefault('sharding_offload', False) reduce_overlap = sharding_config.setdefault('reduce_overlap', False) broadcast_overlap = sharding_config.setdefault('broadcast_overlap', False) other_degree = mp_degree * pp_degree * sharding_degree assert nranks % other_degree == 0, "unreasonable config of dist_strategy." dp_degree = config.setdefault("dp_degree", nranks // other_degree) assert nranks % dp_degree == 0, "unreasonable config of dist_strategy." assert nranks == dp_degree * other_degree, \ "Mismatched config using {} cards with dp_degree[{}]," \ "mp_degree[{}], pp_degree[{}] and sharding_degree[{}]".format(nranks, \ dp_degree, mp_degree, pp_degree, sharding_degree) if sharding_config['sharding_degree'] > 1 and reduce_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['reduce_overlap'] = False logger.warning( "reduce overlap only valid for sharding stage 2 without offload" ) if sharding_config['sharding_degree'] > 1 and broadcast_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['broadcast_overlap'] = False logger.warning( "broadcast overlap only valid for sharding stage 2 without offload" ) if broadcast_overlap and configs['Engine']['logging_freq'] == 1: logger.warning( "Set logging_freq to 1 will disable broadcast_overlap. " "If you want to overlap the broadcast, please increase the logging_freq." ) sharding_config['broadcast_overlap'] = False if sharding_config['sharding_degree'] > 1: if getattr(sharding_config, 'broadcast_overlap', False): logger.warning( "Enable broadcast overlap for sharding will not use pin memory for dataloader" ) use_pinned_memory(False) if 'fuse_sequence_parallel_allreduce' not in config: config['fuse_sequence_parallel_allreduce'] = False if 'use_main_grad' in config and config['use_main_grad'] is True: logger.warning( "If use_main_grad is True, fuse_sequence_parallel_allreduce will be forced to False" ) config['fuse_sequence_parallel_allreduce'] = False def process_global_configs(config): """ process global configs for hybrid parallel """ dp_degree = config['Distributed']['dp_degree'] pp_degree = config['Distributed']['pp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] config['Global']['enable_partial_send_recv'] = True if 'sequence_parallel' in config['Model'] and pp_degree > 1: if config['Model']['sequence_parallel']: config['Global']['enable_partial_send_recv'] = False logger.warning( "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv will be set False." 
) global_cfg = config['Global'] # Set environment variable flags = global_cfg.get("flags", {}) paddle.set_flags(flags) for k, v in flags.items(): logger.info("Environment variable {} is set {}.".format(k, v)) if global_cfg['global_batch_size'] is None and global_cfg[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is not None: assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == (dp_degree * sharding_degree), "global_batch_size[{}] should be divided by local_batch_size[{}] "\ "when dp_degree is [{}] and sharding_degree is [{}]".format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree, sharding_degree) elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is None: assert global_cfg['global_batch_size'] % (dp_degree * sharding_degree) == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]"\ .format(global_cfg['global_batch_size'], dp_degree, sharding_degree) global_cfg['local_batch_size'] = global_cfg['global_batch_size'] // ( dp_degree * sharding_degree) else: global_cfg['global_batch_size'] = global_cfg[ 'local_batch_size'] * dp_degree * sharding_degree assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0 def process_engine_config(config): """ process engine """ # save_load config.Engine['save_load'] = config.Engine.get('save_load', {}) save_load_cfg = config.Engine.save_load save_steps = save_load_cfg.get('save_steps', None) save_epoch = save_load_cfg.get('save_epoch', None) if save_steps is None or save_steps == -1: save_load_cfg[ 'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint if save_epoch is None or save_epoch == -1: save_load_cfg['save_epoch'] = 1 save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output') save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None) # mix_precision config.Engine['mix_precision'] = config.Engine.get('mix_precision', {}) amp_cfg = config.Engine.mix_precision amp_cfg['enable'] = amp_cfg.get('enable', False) amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768) amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None) amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None) # engine config.Engine['max_steps'] = config.Engine.get('max_steps', 500000) config.Engine['eval_freq'] = config.Engine.get('eval_freq', -1) config.Engine['eval_iters'] = config.Engine.get('eval_iters', 0) config.Engine['logging_freq'] = config.Engine.get('logging_freq', 1) config.Engine['num_train_epochs'] = config.Engine.get('num_train_epochs', 1) config.Engine['test_iters'] = config.Engine['eval_iters'] * 10 \ if config.Engine.get('test_iters', None) is None else config.Engine['test_iters'] config.Engine[ 'accumulate_steps'] = config.Global.local_batch_size // config.Global.micro_batch_size class AttrDict(dict): def __getattr__(self, key): return self[key] def __setattr__(self, key, value): if key in self.__dict__: self.__dict__[key] = value else: self[key] = value def __copy__(self): cls = self.__class__ result = cls.__new__(cls) result.__dict__.update(self.__dict__) return result def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): setattr(result, k, copy.deepcopy(v, memo)) for k, v in self.items(): setattr(result, k, copy.deepcopy(v, memo)) return result def 
setdefault(self, k, default=None): if k not in self or self[k] is None: self[k] = default return default else: return self[k] def create_attr_dict(yaml_config): from ast import literal_eval for key, value in yaml_config.items(): if type(value) is dict: yaml_config[key] = value = AttrDict(value) if isinstance(value, str): try: value = literal_eval(value) except BaseException: pass if isinstance(value, AttrDict): create_attr_dict(yaml_config[key]) else: yaml_config[key] = value def parse_config(cfg_file): """Load a config file into AttrDict""" def _update_dic(dic, base_dic): '''Update config from dic based base_dic ''' base_dic = base_dic.copy() dic = dic.copy() if dic.get('_inherited_', True) == False: dic.pop('_inherited_') return dic for key, val in dic.items(): if isinstance(val, dict) and key in base_dic: base_dic[key] = _update_dic(val, base_dic[key]) else: base_dic[key] = val dic = base_dic return dic def _parse_from_yaml(path): '''Parse a yaml file and build config''' with codecs.open(path, 'r', 'utf-8') as file: dic = yaml.load(file, Loader=yaml.FullLoader) if '_base_' in dic: cfg_dir = os.path.dirname(path) base_path = dic.pop('_base_') base_path = os.path.join(cfg_dir, base_path) base_dic = _parse_from_yaml(base_path) dic = _update_dic(dic, base_dic) return dic yaml_dict = _parse_from_yaml(cfg_file) yaml_config = AttrDict(yaml_dict) create_attr_dict(yaml_config) return yaml_config def print_dict(d, delimiter=0): """ Recursively visualize a dict and indenting acrrording by the relationship of keys. """ placeholder = "-" * 60 for k, v in sorted(d.items()): if isinstance(v, dict): logger.info("{}{} : ".format(delimiter * " ", k)) print_dict(v, delimiter + 4) elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): logger.info("{}{} : ".format(delimiter * " ", k)) for value in v: print_dict(value, delimiter + 4) else: logger.info("{}{} : {}".format(delimiter * " ", k, v)) if k.isupper(): logger.info(placeholder) def print_config(config): """ visualize configs Arguments: config: configs """ advertise() print_dict(config) def check_config(config): """ Check config """ # global_batch_size = config.get("") global_config = config.get('Global') check.check_version() device = global_config.get('device', 'gpu') device = device.lower() if device in ['gpu', 'xpu', 'rocm', 'npu', "cpu", 'mlu']: check.check_device(device) else: raise ValueError( f"device({device}) is not in ['gpu', 'xpu', 'rocm', 'npu', 'cpu', 'mlu'],\n" "Please ensure the config option Global.device is one of these devices" ) def override(dl, ks, v): """ Recursively replace dict of list Args: dl(dict or list): dict or list to be replaced ks(list): list of keys v(str): value to be replaced """ def str2num(v): try: return eval(v) except Exception: return v assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") assert len(ks) > 0, ('lenght of keys should larger than 0') if isinstance(dl, list): k = str2num(ks[0]) if len(ks) == 1: assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) dl[k] = str2num(v) else: override(dl[k], ks[1:], v) else: if len(ks) == 1: # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) if not ks[0] in dl: print('A new field ({}) detected!'.format(ks[0], dl)) dl[ks[0]] = str2num(v) else: if ks[0] not in dl.keys(): dl[ks[0]] = {} print("A new Series field ({}) detected!".format(ks[0], dl)) override(dl[ks[0]], ks[1:], v) def override_config(config, options=None): """ Recursively override the config Args: config(dict): dict to be replaced options(list): list of 
pairs(key0.key1.idx.key2=value) such as: [ 'topk=2', 'VALID.transforms.1.ResizeImage.resize_short=300' ] Returns: config(dict): replaced config """ if options is not None: for opt in options: assert isinstance(opt, str), ( "option({}) should be a str".format(opt)) assert "=" in opt, ( "option({}) should contain a =" "to distinguish between key and value".format(opt)) pair = opt.split('=') assert len(pair) == 2, ("there can be only a = in the option") key, value = pair keys = key.split('.') override(config, keys, value) return config def get_config(fname, overrides=None, show=False): """ Read config from file """ assert os.path.exists(fname), ( 'config file({}) is not exist'.format(fname)) config = parse_config(fname) override_config(config, overrides) process_dist_config(config) process_global_configs(config) process_engine_config(config) create_attr_dict(AttrDict(config)) if show: print_config(config) check_config(config) return config def process_auto_dist_configs(config): """ process distributed strategy for auto parallel """ configs = config['Distributed'] nranks = dist.get_world_size() mp_degree = configs.setdefault("mp_degree", 1) pp_degree = configs.setdefault("pp_degree", 1) sharding_config = configs['sharding'] sharding_degree = sharding_config.setdefault("sharding_degree", 1) other_degree = mp_degree * pp_degree assert nranks % other_degree == 0, "Requires nranks should be divided by mp_degree*pp_degree." dp_degree = configs.setdefault("dp_degree", nranks // other_degree) assert nranks % dp_degree == 0, "unreasonable config of dist_strategy." assert nranks == dp_degree * other_degree, \ "Mismatched config using {} cards with dp_degree[{}]," \ "mp_degree[{}], pp_degree[{}] and sharding_degree[{}]".format(nranks, \ dp_degree, mp_degree, pp_degree, sharding_degree) def process_auto_global_configs(config): """ process global configs for auto parallel """ dp_degree = config['Distributed']['dp_degree'] pp_degree = config['Distributed']['pp_degree'] # sharding_degree = config['Distributed']['sharding_degree'] config['Global']['enable_partial_send_recv'] = True if config.get('Model', None) is not None and 'sequence_parallel' in config[ 'Model'] and pp_degree > 1: if config['Model']['sequence_parallel']: config['Global']['enable_partial_send_recv'] = False logger.warning( "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv will be set False." 
) global_cfg = config['Global'] if global_cfg['global_batch_size'] is None and global_cfg[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is not None: assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == dp_degree, \ "global_batch_size[{}] should be divided by local_batch_size[{}] when dp_degree is [{}]"\ .format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree) elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is None: assert global_cfg['global_batch_size'] % dp_degree == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}]".format(global_cfg['global_batch_size'], dp_degree) global_cfg['local_batch_size'] = global_cfg[ 'global_batch_size'] // dp_degree else: global_cfg['global_batch_size'] = global_cfg[ 'local_batch_size'] * dp_degree assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0 def process_auto_engine_configs(config): """ process engine configs for auto parallel """ if config.Engine.get("verbose", None) is None: config.Engine["verbose"] = 2 if config.Engine.get("logging_freq", None) is None: config.Engine["logging_freq"] = 10 config.Engine['save_load'] = config.Engine.get('save_load', {}) save_load_cfg = config.Engine.save_load save_steps = save_load_cfg.get('save_steps', None) save_epoch = save_load_cfg.get('save_epoch', None) if save_steps is None or save_steps == -1: save_load_cfg[ 'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint if save_epoch is None or save_epoch == -1: save_load_cfg['save_epoch'] = 1 save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output') save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None) config.Engine['max_steps'] = config.Engine.get('max_steps', 500000) config.Engine['eval_freq'] = config.Engine.get('eval_freq', -1) config.Engine['eval_iters'] = config.Engine.get('eval_iters', 0) config.Engine['logging_freq'] = config.Engine.get('logging_freq', 1) config.Engine['num_train_epochs'] = config.Engine.get('num_train_epochs', 1) config.Engine['test_iters'] = config.Engine['eval_iters'] * 10 \ if config.Engine.get('test_iters', None) is None else config.Engine['test_iters'] config.Engine[ 'accumulate_steps'] = config.Global.local_batch_size // config.Global.micro_batch_size def process_auto_strategy(config): """ process auto strategy for auto parallel """ strategy = auto.Strategy() strategy.auto_mode = "semi" strategy.seed = config['Global']['seed'] # amp config amp_cfg = config.Engine.get('mix_precision', {}) amp = strategy.amp amp.enable = amp_cfg.get('enable', False) amp.dtype = amp_cfg.get('dtype', "float16") amp.level = amp_cfg.get('level', "o2") amp.init_loss_scaling = amp_cfg.get('scale_loss', 32768) amp.custom_black_list = amp_cfg.get('custom_black_list', []) amp.custom_white_list = amp_cfg.get('custom_white_list', []) amp.use_fp16_guard = amp_cfg.get('use_fp16_guard', False) amp.use_bf16_guard = amp_cfg.get('use_bf16_guard', False) # recompute config if config.get('Model', None) is not None: if not config.Model.get('no_recompute_layers', None): config.Model['no_recompute_layers'] = [] else: assert isinstance(config.Model['no_recompute_layers'], list), "no_recompute_layers should be a list" for i in config.Model['no_recompute_layers']: assert isinstance( i, int ), "all values in no_recompute_layers should be an integer" assert 
min(config.Model['no_recompute_layers']) >= 0, \ "the min value in no_recompute_layers should >= 0" assert max(config.Model['no_recompute_layers']) < config.Model['num_layers'], \ "the max value in no_recompute_layers should < num_layers" config.Model['no_recompute_layers'] = sorted( list(set(config.Model['no_recompute_layers']))) recompute = strategy.recompute recompute.enable = config.Model.get('use_recompute', False) recompute.no_recompute_segments = config.Model.pop( 'no_recompute_layers', []) recompute.enable_tuning = config.get( 'Tuning', False) and config.Tuning.get('tuning_recompute', False) # sharding config sharding_cfg = config.Distributed.get('sharding', {}) sharding = strategy.sharding sharding.enable = sharding_cfg.get('sharding_degree', 1) > 1 sharding.degree = sharding_cfg.get('sharding_degree', 1) sharding.stage = sharding_cfg.get('sharding_stage', 1) # gradient merge config gradient_merge = strategy.gradient_merge gradient_merge.enable = config.Engine.get('accumulate_steps') > 1 gradient_merge.k_steps = config.Engine.get('accumulate_steps', 1) # quantization config qat_cfg = config.get('Quantization', {}) qat = strategy.qat qat.enable = qat_cfg.get('enable', False) qat.channel_wise_abs_max = qat_cfg.get('channel_wise_abs_max', True) qat.weight_bits = qat_cfg.get('weight_bits', 8) qat.activation_bits = qat_cfg.get('activation_bits', 8) qat.onnx_format = qat_cfg.get('onnx_format', True) # tuning config tuning_cfg = config.get('Tuning', {}) tuning = strategy.tuning tuning.enable = tuning_cfg.get('enable', False) tuning.profile_start_step = tuning_cfg.get('profile_start_step', 1) tuning.profile_end_step = tuning_cfg.get('profile_end_step', 1) tuning.run_after_tuning = tuning_cfg.get('run_after_tuning', True) tuning.debug = tuning_cfg.get('debug', True) engine_cfg = config['Engine'] engine_cfg['strategy'] = strategy def process_auto_ckpt_dir(config): configs = config["Engine"]["save_load"] ckpt_dir = configs.get("ckpt_dir", None) if ckpt_dir is None: return assert os.path.isdir(ckpt_dir) == False, "Wrong setting of ckpt_dir!ckpt_dir can't be a folder,"\ "but {} is a folder. Your `ckpt_dir` should be `dirname/prefix` like `output/auto`"\ " if your model path is `output/auto_dist0.pdparams`".format(ckpt_dir) assert os.path.exists(ckpt_dir) == False, "Wrong setting of ckpt_dir,"\ "if you want to load weight,you should set ckpt_dir like this!"\ "for example:\ngpt_auto_model_save\n\t--auto_dist0.pdparams\n\t--auto_dist0.pdparams\n"\ "\t--auto_dist0.pdattr\nyou should set ckpt_dir=\"gpt_auto_model_save/auto\"" parent_path = os.path.split(ckpt_dir)[0] if os.path.exists(parent_path) == False: logging.warning("{} path is not existed!we will set ckpt_dir None.". 
format(parent_path)) configs["ckpt_dir"] = None def get_auto_config(fname, overrides=None, show=False): """ Read config from file for auto parallel """ assert os.path.exists(fname), ( 'config file({}) does not exist'.format(fname)) config = parse_config(fname) override_config(config, overrides) process_auto_dist_configs(config) process_auto_global_configs(config) process_auto_engine_configs(config) process_auto_strategy(config) process_auto_ckpt_dir(config) if show: print_config(config) check_config(config) return config def parse_args(): parser = argparse.ArgumentParser("train script") parser.add_argument( '-c', '--config', type=str, default='configs/config.yaml', help='config file path') parser.add_argument( '-o', '--override', action='append', default=[], help='config options to be overridden') args = parser.parse_args() return args ================================================ FILE: ppfleetx/utils/device.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from .log import logger def get_device_and_mapping(): """ Return device type and a name-bool mapping indicating which type is supported. """ supported_device_map = { "gpu": paddle.is_compiled_with_cuda(), "xpu": paddle.is_compiled_with_xpu(), "rocm": paddle.is_compiled_with_rocm(), "npu": paddle.is_compiled_with_custom_device("npu"), "mlu": 'mlu' in paddle.device.get_all_custom_device_type(), "cpu": True } for d, v in supported_device_map.items(): if v: return d, supported_device_map def get_device(): """ Return the device with which Paddle is compiled, including 'gpu'(for rocm and gpu), 'npu', 'xpu', 'cpu'. """ d, _ = get_device_and_mapping() return d def synchronize(): """ Synchronize device, return True if succeeded, otherwise return False """ device = paddle.get_device().split(":")[0] if device in ["gpu", "rocm"]: paddle.device.cuda.synchronize() return True elif device == "xpu": paddle.device.xpu.synchronize() return True elif device in paddle.device.get_all_custom_device_type(): paddle.device.synchronize() return True else: logger.warning( "The synchronization is only supported on cuda and xpu now.") return False ================================================ FILE: ppfleetx/utils/download.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
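# Download helpers used to fetch pretrained weights and datasets:
# - cached_path() resolves a URL or local path against ~/.cache/ppfleetx/ and
#   only downloads on a cache miss.
# - _download() retries up to DOWNLOAD_RETRY_LIMIT times and streams into a
#   "<name>_tmp" file first, so an interrupted transfer never leaves a
#   truncated file at the final destination.
# - download() lets only local rank 0 perform the download in multi-card runs;
#   the other ranks wait until the file appears.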
import os import time import requests import shutil from ppfleetx.utils.log import logger from tqdm import tqdm import paddle DOWNLOAD_RETRY_LIMIT = 3 def is_url(path): """ Whether path is URL. Args: path (string): URL string or not. """ return path.startswith('http://') or path.startswith('https://') def _map_path(url, root_dir): # parse path after download under root_dir fname = os.path.split(url)[-1] fpath = fname return os.path.join(root_dir, fpath) def cached_path(url_or_path, cache_dir=None): if cache_dir is None: cache_dir = '~/.cache/ppfleetx/' cache_dir = os.path.expanduser(cache_dir) if not os.path.exists(cache_dir): os.makedirs(cache_dir, exist_ok=True) if is_url(url_or_path): path = _map_path(url_or_path, cache_dir) url = url_or_path else: path = url_or_path url = None if os.path.exists(path): logger.info( f"Found {os.path.split(path)[-1]} in cache_dir: {cache_dir}.") return path download(url, path) return path def _download(url, fullname): """ Download from url, save to path. url (str): download url path (str): download to given path """ retry_cnt = 0 while not os.path.exists(fullname): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 else: raise RuntimeError("Download from {} failed. " "Retry limit reached".format(url)) logger.info("Downloading {}".format(url)) try: req = requests.get(url, stream=True) except Exception as e: # requests.exceptions.ConnectionError logger.info("Downloading {} failed {} times with exception {}". format(url, retry_cnt + 1, str(e))) time.sleep(1) continue if req.status_code != 200: raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) # For protecting download interupted, download to # tmp_fullname firstly, move tmp_fullname to fullname # after download finished tmp_fullname = fullname + "_tmp" total_size = req.headers.get('content-length') with open(tmp_fullname, 'wb') as f: if total_size: with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: for chunk in req.iter_content(chunk_size=1024): f.write(chunk) pbar.update(1) else: for chunk in req.iter_content(chunk_size=1024): if chunk: f.write(chunk) shutil.move(tmp_fullname, fullname) return fullname def download(url, path): local_rank = 0 world_size = 1 if paddle.fluid.core.is_compiled_with_dist( ) and paddle.distributed.get_world_size() > 1: local_rank = paddle.distributed.ParallelEnv().dev_id world_size = paddle.distributed.get_world_size() if world_size > 1 and local_rank != 0: while not os.path.exists(path): time.sleep(1) else: _download(url, path) ================================================ FILE: ppfleetx/utils/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
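# Helpers for exporting a dygraph model as a static inference program:
# - _prune_input_spec() temporarily switches to static mode, prunes the traced
#   program down to the requested targets, and keeps only the input specs that
#   still exist in the pruned program.
# - export_inference_model() converts the model with paddle.jit.to_static and
#   saves it with paddle.jit.save; when export_quant_model is True, it saves
#   the quantized model through the supplied quanter instead.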
import os import paddle import logging from .log import logger __all__ = ['export_inference_model'] def _prune_input_spec(input_spec, program, targets): # try to prune static program to figure out pruned input spec # so we perform following operations in static mode device = paddle.get_device() paddle.enable_static() paddle.set_device(device) pruned_input_spec = [] program = program.clone() program = program._prune(targets=targets) global_block = program.global_block() for spec in input_spec: try: v = global_block.var(spec.name) pruned_input_spec.append(spec) except Exception: pass paddle.disable_static(place=device) return pruned_input_spec def export_inference_model( model, input_spec, save_dir='./output', save_name='model', export_quant_model=False, quanter=None, ): if not os.path.exists(save_dir): os.makedirs(save_dir) static_model = paddle.jit.to_static(model, input_spec) pruned_input_spec = _prune_input_spec(input_spec, static_model.forward.main_program, static_model.forward.outputs) if export_quant_model: quanter.save_quantized_model( model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) logger.info("export quantized inference model saved in {}".format( save_dir)) return paddle.jit.save( static_model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) logger.info("export inference model saved in {}".format(save_dir)) ================================================ FILE: ppfleetx/utils/file.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
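# Small file utilities shared by the data pipelines:
# - unzip()/untar() are decorated with env.work_at_local_rank0 so each node
#   extracts an archive exactly once in distributed runs.
# - parse_csv() reads a delimited text file and applies optional filter_funcs
#   (row -> bool) and map_funcs (row -> row) to every row before collecting it.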
import os import csv import zipfile import tarfile from typing import Iterable, Callable import paddle from ppfleetx.distributed.apis import env @env.work_at_local_rank0 def unzip(zip_path, mode="r", out_dir=None, delete=False): with zipfile.ZipFile(zip_path, mode) as zip_ref: zip_ref.extractall(out_dir) if delete: os.remove(zip_path) @env.work_at_local_rank0 def untar(tar_path, mode="r:gz", out_dir=None, delete=False): try: with tarfile.open(tar_path, mode) as f: f.extractall(out_dir) finally: if delete: os.remove(tar_path) def parse_csv(path, skip_lines=0, delimiter=' ', quotechar='|', quoting=csv.QUOTE_NONE, map_funcs=None, filter_funcs=None): with open(path, newline='') as csvfile: data = [] spamreader = csv.reader( csvfile, delimiter=delimiter, quotechar=quotechar, quoting=quoting) for idx, row in enumerate(spamreader): if idx < skip_lines: continue filter_flag = True if filter_funcs is not None: if isinstance(filter_funcs, Iterable): for func in filter_funcs: filter_flag = func(row) if filter_flag is False: break else: assert isinstance(filter_funcs, Callable) filter_flag = filter_funcs(row) if filter_flag is False: continue if map_funcs is not None: if isinstance(map_funcs, Iterable): for func in map_funcs: row = func(row) else: assert isinstance(map_funcs, Callable) row = map_funcs(row) data.append(row) return data ================================================ FILE: ppfleetx/utils/log.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
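# Colorized logger used across PaddleFleetX. Besides the standard levels, two
# custom levels are registered: TRAIN (21) and EVAL (22), so training and
# evaluation messages can be filtered and colored separately. The module-level
# `logger` instance defined below is what the rest of the code base imports.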
import contextlib import copy import functools import logging import os import sys import time import datetime import threading from typing import List import colorlog from colorama import Fore import paddle loggers = {} log_config = { 'DEBUG': { 'level': 10, 'color': 'purple' }, 'INFO': { 'level': 20, 'color': 'green' }, 'TRAIN': { 'level': 21, 'color': 'cyan' }, 'EVAL': { 'level': 22, 'color': 'blue' }, 'WARNING': { 'level': 30, 'color': 'yellow' }, 'ERROR': { 'level': 40, 'color': 'red' }, 'CRITICAL': { 'level': 50, 'color': 'bold_red' } } class Logger(object): ''' Deafult logger in PaddleFleetX Args: name(str) : Logger name, default is 'PaddleFleetX' ''' def __init__(self, name: str=None): name = 'PaddleFleetX' if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): logging.addLevelName(conf['level'], key) self.__dict__[key] = functools.partial(self.__call__, conf['level']) self.__dict__[key.lower()] = functools.partial(self.__call__, conf['level']) self.format = colorlog.ColoredFormatter( '%(log_color)s[%(asctime)-15s] [%(levelname)s]%(reset)s - %(message)s', log_colors={ key: conf['color'] for key, conf in log_config.items() }) self.handler = logging.StreamHandler() self.handler.setFormatter(self.format) self.logger.addHandler(self.handler) self.logLevel = 'DEBUG' self.logger.setLevel(logging.DEBUG) self.logger.propagate = False self._is_enable = True def disable(self): self._is_enable = False def enable(self): self._is_enable = True @property def is_enable(self) -> bool: return self._is_enable def __call__(self, log_level: str, msg: str): if not self.is_enable: return self.logger.log(log_level, msg) @contextlib.contextmanager def use_terminator(self, terminator: str): old_terminator = self.handler.terminator self.handler.terminator = terminator yield self.handler.terminator = old_terminator @contextlib.contextmanager def processing(self, msg: str, interval: float=0.1): ''' Continuously print a progress bar with rotating special effects. Args: msg(str): Message to be printed. interval(float): Rotation interval. Default to 0.1. ''' end = False def _printer(): index = 0 flags = ['\\', '|', '/', '-'] while not end: flag = flags[index % len(flags)] with self.use_terminator('\r'): self.info('{}: {}'.format(msg, flag)) time.sleep(interval) index += 1 t = threading.Thread(target=_printer) t.start() yield end = True logger = Logger() def advertise(): """ Show the advertising message like the following: =========================================================== == PaddleFleetX is powered by PaddlePaddle ! == =========================================================== == == == For more info please go to the following website. == == == == https://github.com/PaddlePaddle/PaddleFleetX == =========================================================== """ copyright = "PaddleFleetX is powered by PaddlePaddle !" ad = "For more info please go to the following website." 
website = "https://github.com/PaddlePaddle/PaddleFleetX" AD_LEN = 6 + len(max([copyright, ad, website], key=len)) logger.info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format( "=" * (AD_LEN + 4), "=={}==".format(copyright.center(AD_LEN)), "=" * (AD_LEN + 4), "=={}==".format(' ' * AD_LEN), "=={}==".format(ad.center(AD_LEN)), "=={}==".format(' ' * AD_LEN), "=={}==".format(website.center(AD_LEN)), "=" * (AD_LEN + 4), )) from .device import synchronize def get_timestamp(): if synchronize(): return time.time() else: logger.warning(f"Device synchronizing failed, which may result uncorrect time") return time.time() def convert_timestamp_to_data(timeStamp): return str(datetime.timedelta(seconds=int(timeStamp))) ================================================ FILE: ppfleetx/utils/tensor_fusion_helper.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from paddle.framework import core import numpy as np from collections import OrderedDict from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_storage import ParamStorage, GradStorage from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import Type alignment = {"gpu": 256, } align = { Type.fp16.value: 2, Type.fp32.value: 4, } def assign_group_by_size(parameters, group_size=256 * 1024 * 1024): is_sparse_gradient = [False] * len(parameters) group_indices = core.eager_assign_group_by_size( parameters, is_sparse_gradient, [group_size, group_size]) var_groups = OrderedDict() for group_idx, indices in enumerate(group_indices): for index in indices: var_groups.setdefault(group_idx, []).append(parameters[index]) return var_groups def flatten_dense_tensors(parameters): _buffer_size = 0 _param2align = {} dtype = parameters[0].dtype for param in parameters: assert param.trainable, "param must be trainable..." 
size = np.prod(param.shape) * align[dtype] remaining = size % alignment["gpu"] ali = 0 if remaining == 0 else alignment["gpu"] - remaining align_ = ali // align[dtype] _buffer_size += np.prod(param.shape) + align_ _param2align[param.name] = align_ param_storage = ParamStorage(size=_buffer_size, dtype=dtype, device="gpu") param_storage.add_rank_params(parameters, _param2align) # process gradient grad_storage = GradStorage( size=_buffer_size, dtype=dtype, device="gpu", destination="0", parm2align=_param2align) for param in parameters: grad_storage.add_grad(param, _param2align[param.name]) # param_storage --> grad_storage param_storage.buffer._copy_gradient_from(grad_storage.buffer) param_storage.buffer.stop_gradient = False return param_storage, grad_storage def obtain_storage(parameters): if len(parameters) < 1: return [] var_groups = assign_group_by_size(parameters) storage = [] for group_idx, parameters in var_groups.items(): param_storage, grad_storage = flatten_dense_tensors(parameters) storage.append(param_storage.buffer) return storage def fused_parameters(parameters, use_sharding=False): decay_params = [] other_params = [] for param in parameters: if not any(nd in param.name for nd in ["bias", "norm", "b_0"]): decay_params.append(param) else: other_params.append(param) decay_fused = decay_params if use_sharding else obtain_storage( decay_params) other_fused = other_params if use_sharding else obtain_storage( other_params) all_fused = decay_fused + other_fused return decay_fused, all_fused def all_reduce_parameters(params, group): if group.nranks < 2: return div_factor = 1.0 / group.nranks with paddle.framework.no_grad(): for p in params: grad = p.grad.scale_(div_factor) paddle.distributed.all_reduce(grad, group=group) ================================================ FILE: ppfleetx/utils/version.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from ppfleetx.utils.log import logger def version_check(): version = paddle.version.full_version logger.info('run with paddle {}, commit id {}'.format(paddle.__version__, paddle.__git_commit__[:8])) if version != '0.0.0': paddle.utils.require_version(min_version='2.4.0') ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp1.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
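# Exports the 345M ERNIE model for inference through the auto-parallel export
# entry point (tools/auto_export.py) on a single GPU (mp_degree=1).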
log_dir=log_auto rm -rf $log_dir # 345M mp1 export python -m paddle.distributed.launch --log_dir $log_dir --devices "0" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \ -o Distributed.mp_degree=1 \ ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp2.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 345M mp2 export python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \ -o Distributed.mp_degree=2 \ ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp2_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 345M mp2 export python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \ -o Distributed.mp_degree=2 \ -o Global.device=npu ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp2_xpu.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
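# Exports the 345M ERNIE model with 2-way model parallelism on two XPU devices.
# The sed command below flips the base config's device from gpu to xpu before
# export, and BKCL_PCIE_RING is set for the XPU collective communication
# library (BKCL).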
log_dir=log_auto
rm -rf $log_dir

FILENAME=./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_base.yaml
sed -i "s/device: gpu/device: xpu/g" $FILENAME
export BKCL_PCIE_RING=1

# 345M mp2 export
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \
    ./tools/auto_export.py \
    -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \
    -o Distributed.mp_degree=2 \


================================================
FILE: projects/ernie/docs/README.md
================================================
# ERNIE: Enhanced Representation through kNowledge IntEgration

## 1. Model introduction

ERNIE is Baidu's pioneering knowledge-enhanced continual-learning framework for semantic understanding. It combines large-scale pre-training data with rich, multi-source knowledge and, through continual learning, keeps absorbing lexical, structural and semantic knowledge from massive text corpora, so that the model keeps improving.

ERNIE significantly surpasses previous state-of-the-art results on 16 public datasets covering sentiment analysis, text matching, natural language inference, lexical analysis, reading comprehension, intelligent question answering and more. On GLUE, the authoritative benchmark for general language understanding, it was the first to break the 90-point mark and ranked first worldwide. The related work has been published at top conferences such as AAAI and IJCAI. ERNIE is also deployed at scale in industry, e.g. in search engines, news recommendation, advertising systems, voice interaction and intelligent customer service.

ERNIE learns real-world semantic knowledge by modeling words, entities and entity relations in massive data. Whereas BERT learns from the raw language signal, ERNIE directly models prior semantic knowledge units, which strengthens its semantic representation ability. An example:

```
Learnt by BERT :哈 [mask] 滨是 [mask] 龙江的省会,[mask] 际冰 [mask] 文化名城。
Learnt by ERNIE:[mask] [mask] [mask] 是黑龙江的省会,国际 [mask] [mask] 文化名城。
```

With BERT, the model can recover the character 『尔』 simply from the local co-occurrence of 『哈』 and 『滨』, without learning any knowledge about the entity 『哈尔滨』 (Harbin). ERNIE, by learning representations of words and entities, can model the relation between 『哈尔滨』 and 『黑龙江』 (Heilongjiang): that Harbin is the capital of Heilongjiang and a city famous for ice and snow.

### 1.1 Directory structure

```text
.
├── docs
│   └── inference.md
│   └── README.md
├── auto_export_ernie_345M_mp1.sh           # 345M ernie-base model, auto-parallel single-card export
├── auto_export_ernie_345M_mp2.sh           # 345M ernie-base model, auto-parallel multi-card export
├── auto_export_ernie_345M_mp2_xpu.sh       # 345M ernie-base model, auto-parallel multi-card export (XPU)
├── export_ernie_345M_single_card.sh        # 345M ernie-base model, single-card export
├── finetune_ernie_345M_single_card.sh      # 345M ernie-base model, single-card finetuning
├── inference.py                            # ernie inference code
├── pretrain_ernie_base_175B_mp8_pp16.sh    # 175B ernie-base model, 3D hybrid parallelism
├── pretrain_ernie_base_3D.sh               # CI test
├── pretrain_ernie_base_6.7B_sharding16.sh  # 6.7B ernie-base model, sharding16
├── pretrain_ernie_base.sh                  # 345M ernie-base model, single card
├── pretrain_ernie_large.sh                 # ernie-large model, single card
├── run_inference.sh                        # ernie inference launch script
├── run_inference_mp2.sh                    # ernie multi-card inference launch script
└── run_inference_mp2_xpu.sh                # ernie multi-card inference launch script (XPU)
```

### 1.2 Dependencies

- paddlenlp
- pybind11

Install them with `pip install pybind11 paddlenlp`.

## 2. Chinese pre-training

ERNIE is pre-trained with an MLM (Masked Language Model) objective using WWM (Whole Word Masking), so all tokens belonging to one complete semantic unit are masked together. The overall training loss is mlm_loss + sop_loss.

### 2.1 Pre-training on a small corpus: 14GB - CLUECorpusSmall
#### Data preparation

For the download, please refer to the [data_tools](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpusSmall.md) docs and follow the `CLUECorpusSmall` dataset processing tutorial there to fetch the data. Once the download has finished:

Unpack the files:

```shell
unzip comment2019zh_corpus.zip -d clue_corpus_small_14g/comment2019zh_corpus
unzip news2016zh_corpus.zip -d clue_corpus_small_14g/news2016zh_corpus
unzip webText2019zh_corpus.zip -d clue_corpus_small_14g/webText2019zh_corpus
unzip wiki2019zh_corpus.zip -d clue_corpus_small_14g/wiki2019zh_corpus
```

Convert the txt files to jsonl format:

```
python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl
```

We now have the dataset in jsonl format. Next, the data is converted into the format required by the training task; ernie is used as the example here:

```
python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \
    --model_name ernie-1.0-base-zh \
    --tokenizer_name ErnieTokenizer \
    --input_path clue_corpus_small_14g.jsonl \
    --split_sentences\
    --chinese \
    --cn_whole_word_segment \
    --cn_seg_func jieba \
    --output_prefix clue_corpus_small_14g_20220104 \
    --workers 48 \
    --log_interval 10000
```

The corpus contains roughly `15702702` documents; because word segmentation is time-consuming, this step takes about one hour. The files needed for training are produced in the current directory:

```
clue_corpus_small_14g_20220104_ids.npy
clue_corpus_small_14g_20220104_idx.npz
```
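A quick way to sanity-check the produced files before launching training is to load them with NumPy. This is a minimal sketch; the exact array names stored inside the `.npz` index depend on the preprocessing script version, so they are printed rather than assumed:

```python
import numpy as np

# Flat token-id stream written by create_pretraining_data.py
ids = np.load("clue_corpus_small_14g_20220104_ids.npy", mmap_mode="r")
print("token ids:", ids.shape, ids.dtype)

# Index describing document/sentence boundaries; key names may differ
idx = np.load("clue_corpus_small_14g_20220104_idx.npz")
print("index arrays:", idx.files)
```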
#### Start training

Move the generated data files `clue_corpus_small_14g_20220104_ids.npy` and `clue_corpus_small_14g_20220104_idx.npz` into input_dir, and training can begin.

Besides single-card training, PaddlePaddle supports data parallelism, hybrid parallelism, auto parallelism, recomputation and other distributed strategies that reduce memory usage and speed up training, making large models both trainable and fast to train. Before training, an appropriate parallel strategy should be chosen according to the model size. The following introduces the configuration files and launch commands for ERNIE training from three angles: single-card training, hybrid-parallel training and auto-parallel training.

- Single-card training

```shell
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
# 345M
python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml
```

- Hybrid parallelism

```shell
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
# 175B run_pretrain
log_dir=log_175B
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
    ./tools/train.py \
    -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml
```

## 3. Downstream task finetuning

Based on a checkpoint produced during training, users can quickly evaluate the current model. PaddleFleetX already supports the mainstream downstream task of sequence classification; users can evaluate whichever datasets they need.

#### Examples

- Single-card training

```
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
python tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml
```

- Data parallelism

```
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
log_dir=log_dp8
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
    ./tools/train.py \
    -c ./ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml \
    -o Model.use_recompute=True
```
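The `-o` options above override individual entries of the YAML config by dotted key path; they are parsed by `override_config`/`get_config` in `ppfleetx/utils/config.py`. Below is a minimal sketch of the same mechanism used programmatically; the config path and override values are illustrative only:

```python
from ppfleetx.utils.config import get_config

# Equivalent to passing on the command line:
#   -o Model.use_recompute=True -o Engine.max_steps=10
# Each override has the form "dotted.key.path=value".
config = get_config(
    "ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml",
    overrides=["Model.use_recompute=True", "Engine.max_steps=10"])

print(config.Model.use_recompute, config.Engine.max_steps)
```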
## 4. Inference deployment

[Inference deployment](inference.md)


================================================
FILE: projects/ernie/docs/inference.md
================================================
# Inference deployment

After training, the model can be deployed for inference with Paddle Inference, PaddlePaddle's high-performance inference engine, as follows.

## 1. Model export

Taking the `ERNIE(345M)` model as an example:

Export the single-card `ERNIE(345M)` model:

```bash
sh projects/ernie/auto_export_ernie_345M_mp1.sh
```

Export the multi-card `ERNIE(345M)` model:

```bash
sh projects/ernie/auto_export_ernie_345M_mp2.sh
```

Export the multi-card `ERNIE(345M)` model (XPU):

```bash
sh projects/ernie/auto_export_ernie_345M_mp2_xpu.sh
```

## 2. Inference deployment

After the model has been exported, inference can be run through the `projects/ernie/inference.py` script.

`ERNIE(345M)` inference

```bash
bash projects/ernie/run_inference.sh
```

`ERNIE(345M)` multi-card inference

```bash
bash projects/ernie/run_inference_mp2.sh
```

`ERNIE(345M)` multi-card inference (XPU)

```bash
bash projects/ernie/run_inference_mp2_xpu.sh
```

## 3. Benchmark

In progress.

================================================ FILE: projects/ernie/export_ernie_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0 python ./tools/export.py -c ./ppfleetx/configs/nlp/ernie/inference_ernie_345M_single_card.yaml ================================================ FILE: projects/ernie/finetune_ernie_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml ================================================ FILE: projects/ernie/finetune_ernie_345M_single_card_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
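# Finetunes the 345M ERNIE config on a single NPU device; the hidden size is
# overridden to 256 here.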
python tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml \ -o Global.device=npu \ -o Model.hidden_size=256 ================================================ FILE: projects/ernie/inference.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) import numpy as np import paddle.distributed.fleet as fleet from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.core.engine import InferenceEngine import argparse def parse_args(): parser = argparse.ArgumentParser("ernie inference") parser.add_argument( '-m', '--model_dir', type=str, default='./output', help='model dir') parser.add_argument( '-mp', '--mp_degree', type=int, default=1, help='mp degree') parser.add_argument( '-d', '--device', type=str, default='', help='device type') args = parser.parse_args() return args def main(args): fleet.init(is_collective=True) infer_engine = InferenceEngine( args.model_dir, args.mp_degree, device=args.device) tokenizer = GPTTokenizer.from_pretrained("gpt2") text = 'Hi ERNIE. Tell me who Jack Ma is.' inputs = tokenizer(text, padding=True, return_attention_mask=True) whole_data = [ np.array(inputs['token_type_ids']).reshape(1, -1), np.array(inputs['input_ids']).reshape(1, -1) ] outs = infer_engine.predict(whole_data) print(outs) if __name__ == "__main__": args = parse_args() main(args) ================================================ FILE: projects/ernie/pretrain_ernie_base.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=1 python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml ================================================ FILE: projects/ernie/pretrain_ernie_base_175B_mp8_pp16.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir # 175B run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml ================================================ FILE: projects/ernie/pretrain_ernie_base_3D.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ -o Data.Train.dataset.input_dir=./dataset/ernie \ -o Data.Eval.dataset.input_dir=./dataset/ernie \ -o Engine.max_steps=10 ================================================ FILE: projects/ernie/pretrain_ernie_base_3D_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir export PADDLE_P2P_SYNC_SEND=1 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ -o Data.Train.dataset.input_dir=./dataset/ernie \ -o Data.Eval.dataset.input_dir=./dataset/ernie \ -o Engine.max_steps=10 \ -o Global.device=npu ================================================ FILE: projects/ernie/pretrain_ernie_base_6.7B_sharding16.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir # 6.7B+sharding16 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_6.7B_sharding16.yaml ================================================ FILE: projects/ernie/pretrain_ernie_large.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=1 python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml ================================================ FILE: projects/ernie/pretrain_ernie_large_mp2_mlu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export MLU_VISIBLE_DEVICES=0,1 export PADDLE_XCCL_BACKEND=mlu export FLAGS_selected_mlus=0,1 LOG_DIR=log_ernie LOG_GFILE=log_ernie_large_hybrid mkdir -p ${LOG_DIR} python -m paddle.distributed.launch \ --log_dir ${LOG_DIR} \ --device 0,1 tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=mlu \ -o Distributed.mp_degree=2 \ -o Distributed.dp_degree=1 \ -o Distributed.pp_degree=1 \ -o Model.use_recompute=False > ${LOG_DIR}/${LOG_GFILE} 2>&1 & ================================================ FILE: projects/ernie/pretrain_ernie_large_mp2_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
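# Pretrains ERNIE-large with 2-way model parallelism on two NPU devices
# (dp_degree=1, pp_degree=1, recompute disabled).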
python -m paddle.distributed.launch \ --device 0,1 tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=npu \ -o Distributed.mp_degree=2 \ -o Distributed.dp_degree=1 \ -o Distributed.pp_degree=1 \ -o Model.use_recompute=False ================================================ FILE: projects/ernie/pretrain_ernie_large_mp2_pp2_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_P2P_SYNC_SEND=1 python -m paddle.distributed.launch \ --device 0,1,2,3 tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=npu \ -o Distributed.mp_degree=2 \ -o Distributed.dp_degree=1 \ -o Distributed.pp_degree=2 \ -o Model.use_recompute=True ================================================ FILE: projects/ernie/pretrain_ernie_large_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=npu ================================================ FILE: projects/ernie/run_inference.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. unset CUDA_VISIBLE_DEVICES python -u -m paddle.distributed.launch \ --gpus "0,1" \ --log_dir "log" \ projects/ernie/inference.py --model_dir "./output" --mp_degree 2 ================================================ FILE: projects/ernie/run_inference_mp2_npu.sh ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -u -m paddle.distributed.launch \ --devices "0,1" \ --log_dir "log" \ projects/ernie/inference.py --model_dir "./output" --mp_degree 2 --device npu ================================================ FILE: projects/ernie/run_inference_mp2_xpu.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export BKCL_PCIE_RING=1 python -u -m paddle.distributed.launch \ --devices "0,1" \ --log_dir "log" \ projects/ernie/inference.py --model_dir "./output" --mp_degree 2 ================================================ FILE: projects/gpt/auto_export_gpt_175B_mp8.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_mp8 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_175B_mp8.yaml ================================================ FILE: projects/gpt/auto_export_gpt_345M_mp2.sh ================================================ #! 
/bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_mp2 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_mp2.yaml \ ================================================ FILE: projects/gpt/auto_export_gpt_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_345m_mp1 rm -rf $log_dir DIRECTORY=./pretrained if [ ! -d "$DIRECTORY" ]; then echo "start download ckpt" wget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_FP16.tar.gz tar -zxvf GPT_345M_FP16.tar.gz fi python -m paddle.distributed.launch --log_dir $log_dir --devices "1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_single_card.yaml \ -o Engine.save_load.ckpt_dir=./pretrained/auto ================================================ FILE: projects/gpt/auto_export_gpt_6.7B_mp1.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_mp1 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml ================================================ FILE: projects/gpt/auto_export_gpt_fp16_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python ./tools/auto_export.py -c ./ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml \ -o Engine.save_load.output_dir="./serial_model" \ -o Engine.save_load.ckpt_dir="./output/rank_0/model" \ ================================================ FILE: projects/gpt/auto_gpt_1.3B_dp8.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 1.3B+dp8 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml ================================================ FILE: projects/gpt/auto_gpt_1.3B_dp8_tuning.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 1.3B+dp8 recompute tuning python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8_tuning.yaml ================================================ FILE: projects/gpt/auto_gpt_1.3B_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
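# Editor's note (assumption, not part of the original script): FLAGS_USE_STANDALONE_EXECUTOR=False
# falls back to Paddle's legacy executor, which the auto-parallel single-card examples in this repo
# set before calling tools/auto.py; CUDA_VISIBLE_DEVICES=0 pins the run to a single GPU.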
export FLAGS_USE_STANDALONE_EXECUTOR=False export CUDA_VISIBLE_DEVICES=0 python ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml ================================================ FILE: projects/gpt/auto_gpt_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export FLAGS_USE_STANDALONE_EXECUTOR=False export CUDA_VISIBLE_DEVICES=0 python ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_345M_single_card.yaml ================================================ FILE: projects/gpt/auto_gpt_6.7B_sharding16.sh ================================================ #! /bin/bash # Runs the "6.7B" parameter model # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 6.7B+sharding16 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml ================================================ FILE: projects/gpt/auto_qat_export_gpt_345M_mp2.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/qat_generation_gpt_345M_mp2.yaml \ -o Engine.save_load.output_dir="./mp2_qat_model" \ ================================================ FILE: projects/gpt/benchmark.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time import argparse import numpy as np import paddle import paddle.distributed.fleet as fleet from ppfleetx.core.engine.inference_engine import InferenceEngine import ppfleetx_ops def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( "--seq_len", default=128, type=int, required=False, help="seq length of inputs") parser.add_argument( "--iter", default=100, type=int, help="run iterations for timing") parser.add_argument("--mp_degree", default=1, type=int, help="") parser.add_argument( "--model_dir", default="output", type=str, help="model directory") args = parser.parse_args() return args def predict(engine, data, args): with engine._static_guard: for d, name in zip(data, engine.input_names()): handle = engine.predictor.get_input_handle(name) handle.copy_from_cpu(d) for _ in range(10): engine.predictor.run() engine.predictor.get_output_handle(engine.output_names()[ 0]).copy_to_cpu() start = time.perf_counter() for _ in range(args.iter): engine.predictor.run() end = time.perf_counter() print( f"batch {args.iter} run time: {1000 * (end - start) / args.iter}ms") return {name: engine.predictor.get_output_handle(name).copy_to_cpu() \ for name in engine.output_names()} def main(): args = parse_args() fleet.init(is_collective=True) infer_engine = InferenceEngine(args.model_dir, args.mp_degree) ids = [100] * args.seq_len # run test for batch in [1, 2, 4, 8, 16]: whole_data = [ids] * batch whole_data = np.array(whole_data, dtype="int64").reshape(1, batch, -1) _ = predict(infer_engine, whole_data, args) if __name__ == "__main__": main() ================================================ FILE: projects/gpt/docs/README.md ================================================ # GPT ## 模型介绍 GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)/[3](https://arxiv.org/pdf/2005.14165.pdf) 是以[Transformer](https://arxiv.org/abs/1706.03762) 解码器为网络基本组件,使用自回归的方式在大规模无标注文本语料上进行预训练得到的语言生成模型。 本项目是语言模型 GPT 的 PaddlePaddle 大模型实现。目前,PaddleFleetX 提供了 [GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型文件;分别基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl) 和 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) 数据集,采用 ACC(accuracy) 和 PPL(perplexity) 指标后的评估结果如下: | **模型文件** | **ACC** | **PPL** | |---------|-----------|---------------| | GPT-345M | 44.17% | 18.01 | 下面是本例的简要目录结构及说明: ```text . 
├── auto_export_gpt_345M_mp2.sh # 自动并行345M模型两卡张量并行导出入口 ├── auto_gpt_345M_single_card.sh # 自动并行345M模型单卡预训练入口 ├── auto_gpt_1.3B_single_card.sh # 自动并行1.3B模型单卡预训练入口 ├── auto_gpt_1.3B_dp8.sh # 自动并行1.3B模型数据并行预训练入口 ├── auto_gpt_6.7B_sharding16.sh # 自动并行6.7B模型分组切片并行预训练入口 ├── evaluate_gpt_345M_single_card.sh # 单卡345M模型评估入口 ├── export_gpt_345M_single_card.sh # 单卡345M模型动转静导出入口 ├── finetune_gpt_345M_single_card.sh # 单卡345M模型finetune训练入口 ├── inference_gpt_345M_single_card.sh # 单卡345M模型推理入口 ├── pretrain_gpt_345M_single_card.sh # 单卡345M模型预训练入口 ├── pretrain_gpt_1.3B_single_card.sh # 单卡1.3B模型预训练入口 ├── pretrain_gpt_1.3B_dp8.sh # 8卡1.3B模型数据并行预训练入口 ├── pretrain_gpt_6.7B_sharding16.sh # 16卡6.7B模型分组切片并行预训练入口 ├── pretrain_gpt_175B_mp8_pp16.sh # 128卡175B模型混合并行预训练入口 ├── qat_gpt_345M_single_card.sh # 单卡345M模型量化训练入口 ├── qat_gpt_345M_mp8.sh # 8卡345M模型模型并行量化训练入口 ├── qat_gpt_6.7B_sharding16.sh # 16卡6.7B模型分组切片并行量化训练入口 ├── eval_qat_gpt_345M_single_card.sh # 单卡345M量化模型验证入口 ├── export_qat_gpt_345M_single_card.sh # 单卡345M量化模型导出入口 ``` ## 快速开始 ### 环境依赖 请确保已根据根目录 requirements.txt 安装所需依赖,或者通过以下命令快速安装 ```shell python -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ### 数据准备 数据获取和制作详见[GPT 模型预训练数据准备流程](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/gpt) 为了方便用户运行测试本模型,此处提供处理好的300M的训练样本,在单卡训练或混合并行训练前都需要通过以下命令获取数据。 **数据下载命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 # 下载样例数据 mkdir data && cd data wget -O gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz cd .. # 回到 PaddleFleetX 根目录下 ``` ### 模型训练 除了单卡训练,飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略,减少显存占用、加速训练,达到大模型可训练且训得快的效果。在模型训练前,需要根据模型规模选择合适的并行策略。下面分别从单卡训练、混合并行训练和自动并行训练三个方面来介绍GPT模型训练的配置文件和启动方式。 - [单卡训练](./single_card.md) - [混合并行训练](./hybrid_parallel.md) - [自动并行训练](./auto_parallel.md) ### 文本生成体验 - [单卡预训练模型文本生成](./single_card.md#GPT-Zero-shot-文本生成) - [混合并行预训练模型文本生成](./hybrid_parallel.md#GPT-Zero-shot-文本生成) ### 模型压缩 - [量化训练](./quantization_aware_training.md) ### 推理部署 - [推理部署](inference.md) ### GLUE 下游任务微调 - [单卡微调](./single_finetune.md) ## 参数释义 ### 全局信息 全局参数指定训练的batch size,以及设备、随机种子等信息。 ```yaml Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | device | 设备信息 | | seed | 随机数种子 | | global_batch_size | 全局的batch size大小,即一次参数更新等效的batch size | | local_batch_size | 每个进程训练的batch size大小 | | micro_batch_size | 每次前向计算的batch size大小 | ### Engine训练控制 Engine训练设置完成模型训练/验证/推理等过程中的参数设置,是fleetX的EagerEngine的必要参数,所有使用该Engine都必须指定该配置。 其中包含的参数有: ```yaml Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | accumulate_steps | 梯度累加次数 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | | 
enable | 是否使用混合精度策略进行训练 | | dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | | level | 混合精度训练模式,默认``O2``模式 | | scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔step数 | | save_epoch | 保存模型间隔epoch数 | | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ### 模型网络 网络部分完成了网络的组网操作,GPT在[PaddleFleetX/ppfleetx/models/language_model/gpt/dygraph/single_model.py]((https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/dygraph/single_model.py))下。 可以使用配置文件配置模型的规模,如: ```yaml Model: module: "GPTModule" name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True fuse_attn_qkv: True sequence_parallel: False ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | module | 指定GPT模型的执行模块 | | vocab_size | 训练词表大小 | | hidden_size | 隐藏层大小 | | num_layers | transformer层数 | | num_attention_heads | attention head的数量 | | max_seq_len | 输入文本序列的长度 | | ffn_hidden_size | ffn层大小,一般为隐藏层的四倍 | | attention_probs_dropout_prob | attention中的dropout的失活率 | | max_position_embeddings | position embedding的长度 | | type_vocab_size | 词表类型 | | initializer_range | 参数初始化的范围 | | use_recompute | 是否使用recompute训练 | | recompute_granularity | recompute训练的粒度,可选 `full` `full_attn` `core_attn`,full即recompute全部transformer,full_attn表明只recompute所有self attention部分,core_attn表明只recompute `softmax(qkT)v` 部分。注:显存占用方面,`core_attn` > `full_attn` > `full`,若所选策略产生OOM错误,可以适当更改recompute_granularity | |no_recompute_layers| list of integer,标识哪些层的transformer不需要进行recompute。所有在该list中的值应该 >= 0 同时应该 < num_layers。向该参数中增加不进行recompute 的层数可以提升模型训练的整体吞吐,但是会适当的增加显存。若训练中发现有显存富裕,可以适当增加不进行recompute的层数。如果使用该参数后出现OOM错误,可以适当减小不进行recompute的层数。 | | fused_linear | 是否使用fused_linear代替传统Linear加速训练。注:该功能需要cuda 11.6及以上编译的paddle支持。 | | fuse_attn_qkv | 是否对attention层中的qkv计算使用fuse策略以加速训练 | | sequence_parallel | 是否使用序列并行策略以加速训练。注:只有混合并行的GPT才支持该功能,它与张量模型并行共用通信组,当mp_degree=1时,序列并行策略会被强制关闭。 | | virtual_pp_degree | 虚拟流水线并行维度,该参数会减小流水线bubble的占比以提升流水线的吞吐。但是该参数会增加流水线间的通讯,所以该参数的推荐值为2。并且,只有 num_layers可以被 pp_degree * virtual_pp_degree 整除时,才可以使用虚拟流水线并行。 | ### 数据集 数据集参数分为“Train”、“Eval”和“Test”三部分,分别对应模型预训练、离线评估、推理等三个模块。 每个模型的配置参数都包含以下内容: ```yaml Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: DistributedBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | dataset.name | 指定自定义数据集的名称 | | input_dir | 指定输入文件,可以使用目录,指定目录时将包括目录中的所有文件 | | split | 训练集,验证集和测试集的切分比例 | | max_seq_len | 输入文本序列的长度 | | sampler.name | 指定自定义采样器的名称 | | shuffle | 是否需要在生成样本下标时打乱顺序 | | drop_last | 是否需要丢弃最后无法凑整一个mini-batch的样本 | | num_workers | 用于加载数据的子进程个数 | | return_list | 每个设备上的数据是否以list形式返回 | | collate_fn | 通过此参数指定如果将样本列表组合为mini-batch数据;支持自定义 | ### 优化器 GPT训练默认使用AdamW优化器以及cosine学习率衰减,这里通过配置文件配置优化器的参数,如: ```yaml Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 
warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | name | 指定自定义优化器的名称 | | weight_decay | weight的衰减率 | | beta1 | 一阶矩估计的指数衰减率 | | beta2 | 二阶矩估计的指数衰减率 | | epsilon | 指定优化器需要优化的参数 | | lr.name | 指定自定义学习率策略的名称 | | decay_steps | 衰减的步长 | | warmup_rate | warmup 率 | | max_lr | Adam 的初始最大学习率 | | min_lr | Adam 的初始最小学习率 | | grad_clip.name | 指定自定义梯度裁剪策略的名称 | | clip_norm | 所允许的范数最大值 | | tensor_fusion | 是否使用tensor_fustion功能加速训练 | 另外,[Profiler](./hybrid_profiler.md)中还介绍了在 GPT 中开启 Profiler 并分析调试分析结果的方法及相关的参数解释。 ### 模型压缩 PaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法:量化训练(Qutization Aware Training,QAT)、结构化稀疏(Structured Pruning,SP)和知识蒸馏(Knowledge Distillation,KD)。详细参数介绍见[模型压缩介绍](../../../docs/compression.md)。 ## 参考文献 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) - [Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165.pdf) - [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) ================================================ FILE: projects/gpt/docs/auto_parallel.md ================================================ # GPT 自动并行模型训练 分布式并行训练技术使超大模型成为可能,但分布式训练程序的编写门槛较高,并行算法较为复杂,开发者需同时具有较好的工程能力和算法功底。为了降低分布式训练的难度,自动并行成为新的研究热点,受到学术界和工业界的广泛关注。自动并行通常分为半自动并行和全自动并行。半自动并行指的是开发者在单机脚本的基础上额外添加少量标注信息即可表达并行逻辑。而全自动并行则无需开发者添加任何并行逻辑,根据单机脚本自动搜索出较为高效的并行策略,实现分布式训练。 ## 参数释义 ### 全局信息 全局信息指定训练的 batch size,以及设备、随机种子等信息 ```yaml Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |--------------------------------|---------------------------| | device | 设备信息 | | seed | 随机数种子 | | global_batch_size | 全局的batch size大小,即一次参数更新等效的 batch size | | local_batch_size | 每个进程训练的batch size大小 | | micro_batch_size | 每次前向计算的batch size大小 | ### Engine训练控制 Engine训练设置完成模型训练/验证/推理等过程中的参数设置,是PaddleFleetX AutoEngine的必要参数,所有使用该Engine都必须指定该配置。 其中包含的参数有: ```yaml Engine: max_steps: 500000 num_train_epochs: 1 eval_freq: 1 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |-------------------|------------------------------------------| | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔,以epoch为粒度 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | | enable | 是否使用混合精度的类型,可选: `True` `False` | | dtype | 使用混合精度的类型,可选: `float16` `bfloat16`| | level | 使用混合精度训练的等级,可选 `o1` `o2` `o3` | | scale_loss | 使用混合精度float16下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16/bfloat16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算。 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16/bfloat16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算。| | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ### 模型网络 网络部分完成了网络的组网操作,GPT在[PaddleFleetX/ppfleetx/models/language_model/gpt/auto/auto_model.py]((https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/auto/auto_model.py))下。 可以使用配置文件配置模型的规模,如: ```yaml Model: module: "GPTModuleAuto" name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 
num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True fuse_attn_qkv: True ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | module | 指定GPT模型的执行模块 | | vocab_size | 训练词表大小 | | hidden_size | 隐藏层大小 | | num_layers | transformer层数 | | num_attention_heads | attention head的数量 | | max_seq_len | 输入文本序列的长度 | | ffn_hidden_size | ffn层大小,一般为隐藏层的四倍 | | attention_probs_dropout_prob | attention中的dropout的失活率 | | max_position_embeddings | position embedding的长度 | | type_vocab_size | 词表类型 | | initializer_range | 参数初始化的范围 | | use_recompute | 是否使用recompute训练,重计算全部transformer | | fuse_attn_qkv | 是否对attention层中qkv计算使用fuse代替传统Linear加速训练 | ### 数据集 数据集参数分为“Train”、“Eval”和“Test”三部分,分别对应模型预训练、离线评估、推理等三个模块。 每个模型的配置参数都包含以下内容: ```yaml Data: Train: collate_fn: gpt_collate_fn sample_split: 2 dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |-------------------|------------------------| | collate_fn | 通过此参数指定如果将样本列表组合为mini-batch数据;支持自定义 | | sample_split | 通过此参数dataset返回的sample被组织为(inputs,labels) | | dataset.name | 指定自定义数据集的名称 | | input_dir | 指定输入文件,可以使用目录,指定目录时将包括目录中的所有文件 | | split | 训练集,验证集和测试集的切分比例 | | max_seq_len | 输入文本序列的长度 | ### 优化器 GPT训练默认使用AdamW优化器以及cosine学习率衰减,这里通过配置文件配置优化器的参数,如: ```yaml Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 ``` 其中参数说明: | **参数名** | **参数释义** | |----------------|---------------------------| | name | 指定自定义优化器的名称 | | weight_decay | weight的衰减率 | | beta1 | 一阶矩估计的指数衰减率 | | beta2 | 二阶矩估计的指数衰减率 | | epsilon | 指定优化器需要优化的参数 | | lr.name | 指定自定义学习率策略的名称 | | decay_steps | 衰减的步长 | | warmup_rate | warmup 率 | | max_lr | Adam 的初始最大学习率 | | min_lr | Adam 的初始最小学习率 | | grad_clip.name | 指定自定义梯度裁剪策略的名称 | | clip_norm | 所允许的范数最大值 | ### 并行维度 当前GPT模型已适配自动并行的**半自动策略**,用户可以通过配置文件选择并行的维度。 ```yaml Distributed: dp_degree: 2 mp_degree: 2 pp_degree: 2 sharding: sharding_degree: 1 sharding_stage: 1 ``` 其中参数说明: | **参数名** | **参数释义** | |------------------|--------------------------------------| | dp_degree | 数据并行维度 | | mp_degree | 张量模型并行维度 | | pp_degree | 流水线并行维度 | | sharding_degree | 分组切分并行维度 | | sharding_stage | 切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 | ## 运行方式 本目录按照345M、1.3B和6.7B规模大小,给出32G V100环境下GPT模型半自动并行训练的策略配置如下: | 模型规模 | 训练策略 | yaml文件 | |----------|---------------------------- |----------------------------------------| | 345MB | 单卡+fp16 | pretrain_gpt_345M_single_card.yaml | | 1.3B | dp8+fp16+recompute | pretrain_gpt_1.3B_dp8.yaml | | 6.7B | sharding16+fp16+recompute | pretrain_gpt_6.7B_sharding16.yaml | 若要在显存容量更小的16G V100环境下进行GPT大模型训练,可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。 ### 策略支持 自动并行包括2种模式:半自动并行与全自动并行。 半自动并行包括了数据并行、张量模型并行、流水线并行和分组切片并行。此外还支持重计算、混合精度等策略,来减少显存占用、加速训练。**目前,GPT 模型训练可以支持任意维度的策略组合。** | | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute | |-----------------|---------------|-----------------|-------------------|-----------|-----------| | sharding stage1 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage2 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage3 | ✓ | ✓ | ✓ | ✓ | ✓ | ### 单卡训练 以单机1.3B模型训练为例,该gpt程序需要单卡32G V100以运行 **启动命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 export FLAGS_USE_STANDALONE_EXECUTOR=False # 设置执行器环境变量 python 
./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml ``` ### 单机训练 以单机1.3B模型数据并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行。 **启动命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 log_dir=log_auto python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: **启动命令** ```shell log_dir=log_auto python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \ -o Model.hidden_size=1024 ``` 每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到;若未指定,日志路径为`log/workerlog.x`。运行日志具体内容如下: **运行日志** ``` [INFO 2022-08-19 10:47:00,392 engine.py:461] [train] epoch: 0 step: 0 lr: 5.555556e-09 loss: 10.972320 [INFO 2022-08-19 10:47:02,858 engine.py:461] [train] epoch: 0 step: 1 lr: 8.333333e-09 loss: 10.950481 [INFO 2022-08-19 10:47:05,321 engine.py:461] [train] epoch: 0 step: 2 lr: 1.111111e-08 loss: 10.951584 [INFO 2022-08-19 10:47:07,791 engine.py:461] [train] epoch: 0 step: 3 lr: 1.388889e-08 loss: 10.954518 [INFO 2022-08-19 10:47:10,256 engine.py:461] [train] epoch: 0 step: 4 lr: 1.666667e-08 loss: 10.959060 [INFO 2022-08-19 10:47:12,725 engine.py:461] [train] epoch: 0 step: 5 lr: 1.944444e-08 loss: 10.957585 [INFO 2022-08-19 10:47:15,198 engine.py:461] [train] epoch: 0 step: 6 lr: 2.222222e-08 loss: 10.947868 [INFO 2022-08-19 10:47:17,680 engine.py:461] [train] epoch: 0 step: 7 lr: 2.500000e-08 loss: 10.939037 ``` ### 多机训练 若需要在更多机器上进行大模型训练,则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令(master节点ip为训练所用某一台机器的ip即可)。 以2机16卡32G V100上的6.7B模型分组切分并行训练为例,启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型两机训练,也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml \ -o Model.hidden_size=2048 ``` ================================================ FILE: projects/gpt/docs/hybrid_parallel.md ================================================ # GPT 混合并行模型训练 当训练超大模型时,就必须借助混合并行策略,混合并行策略分别指数据并行、张量模型并行、流水线并行和分组切片并行。其中数据并行保存完整的模型参数并独立处理一份子数据集,以加速模型训练过程;张量模型并行将网络中的张量(Tensor)切分到不同的设备,从而降低单个设备的显存消耗;流水线并行将模型的不同层放置到不同的计算设备,降低单个计算设备的显存消耗;分组切片并行将参数和模型状态划分到不同卡上,每个GPU只保存部分副本,以减少显存占用。联合四种训练方式,可以实现更大模型、更快训练的效果。具体策略以及相关FleetAPI介绍可以参考以下教程: - [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html) - [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html ) - [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html) - [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html) ## 参数释义 ### 并行维度 当前GPT模型已适配3D混合并行,并能够在训练超大模型,用户可以通过配置文件选择并行的维度。 ```yaml Distributed: dp_degree: 2 
mp_degree: 2 pp_degree: 2 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ``` 其中参数说明: | **参数名** | **参数释义** | |------------------|--------------------------------------| | dp_degree | 数据并行维度 | | mp_degree | 张量模型并行维度 | | pp_degree | 流水线并行维度 | | sharding_degree | 分组切分并行维度 | | sharding_stage | 切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 | | sharding_offload | CPU offload策略 | |reduce_overlap| 是否在sharding stage 2的模式下进行reduce通讯与反向计算的overlap,该策略暂时不支持sharding_offload| |broadcast_overlap| 是否在sharding stage 2的模式下进行broadcast通讯与下一个batch的 前向计算的overlap,该策略暂时不支持sharding_offload。若使用该模型,在evaluation与save之前,必须调用 `paddle.device.cuda.synchronize()` 方法| ## 运行方式 本目录中按照345M、1.3B、6.7B和175B规模大小,给出32G V100环境下GPT模型混合并行训练的策略配置如下: | 模型规模 | 训练策略 | yaml文件 | |----------|---------------------------|------------------------------| | 345M | fp16+mp8+qat | qat_gpt_345M_mp8.yaml | | 1.3B | fp16+dp8+recompute | pretrain_gpt_1.3B_dp8.yaml | | 6.7B | fp16+sharding16+recompute | pretrain_gpt_6.7B_sharding16.yaml | | 175B | fp16+mp8+pp16+recompute | pretrain_gpt_175B_mp8_pp16.yaml | 若要在显存容量更小的16G V100环境下进行GPT大模型训练,可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。 ### 策略支持 飞桨的混合并行技术包括4个维度:数据并行、张量模型并行、流水线并行和分组切片并行,此外还支持重计算、offload、混合精度、序列并行等策略,来减少显存占用、加速训练。 目前,GPT模型训练已支持前3个维度的任意策略组合,但分组切片并行stage2/3仅支持与数据并行策略组合使用;详见下表。 | | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute | |-----------------|---------------|-----------------|-------------------|-----------|-----------| | sharding stage1 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage2 | ✓ | ㄨ | ㄨ | ✓ | ✓ | | sharding stage3 | ✓ | ㄨ | ㄨ | ✓ | ✓ | ### 单机训练 以单机1.3B模型数据并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行。 **启动命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: **启动命令** ```shell log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ -o Model.hidden_size=1024 ``` 每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到;若未指定,日志路径为`log/workerlog.x`。运行日志具体内容如下: **运行日志** ``` [2022-09-21 05:43:58,797] [ INFO] - [train] epoch: 0, batch: 0, loss: 10.992407799, avg_batch_cost: 5.51734 sec, speed: 0.18 step/s, ips_total: 11878 tokens/s, ips: 1485 tokens/s, learning rate: 2.77778e-08 [2022-09-21 05:43:59,508] [ INFO] - [train] epoch: 0, batch: 1, loss: 11.000075340, avg_batch_cost: 0.71029 sec, speed: 1.41 step/s, ips_total: 92267 tokens/s, ips: 11533 tokens/s, learning rate: 4.16667e-08 [2022-09-21 05:44:00,242] [ INFO] - [train] epoch: 0, batch: 2, loss: 11.017463684, avg_batch_cost: 0.73301 sec, speed: 1.36 step/s, ips_total: 89406 tokens/s, ips: 11176 tokens/s, learning rate: 5.55556e-08 [2022-09-21 05:44:00,965] [ INFO] - [train] epoch: 0, batch: 3, loss: 10.983654976, avg_batch_cost: 0.72319 sec, speed: 1.38 step/s, ips_total: 90620 tokens/s, ips: 11328 tokens/s, learning rate: 6.94444e-08 [2022-09-21 05:44:01,678] [ INFO] - [train] epoch: 0, batch: 4, loss: 11.014451981, avg_batch_cost: 0.71223 sec, speed: 1.40 step/s, ips_total: 92016 tokens/s, ips: 11502 tokens/s, learning rate: 8.33333e-08 [2022-09-21 05:44:02,385] [ INFO] - [train] epoch: 0, batch: 5, loss: 11.005180359, avg_batch_cost: 0.70707 sec, speed: 1.41 
step/s, ips_total: 92687 tokens/s, ips: 11586 tokens/s, learning rate: 9.72222e-08 [2022-09-21 05:44:03,100] [ INFO] - [train] epoch: 0, batch: 6, loss: 10.989698410, avg_batch_cost: 0.71402 sec, speed: 1.40 step/s, ips_total: 91785 tokens/s, ips: 11473 tokens/s, learning rate: 1.11111e-07 [2022-09-21 05:44:03,806] [ INFO] - [train] epoch: 0, batch: 7, loss: 10.992337227, avg_batch_cost: 0.70554 sec, speed: 1.42 step/s, ips_total: 92888 tokens/s, ips: 11611 tokens/s, learning rate: 1.25000e-07 [2022-09-21 05:44:04,516] [ INFO] - [train] epoch: 0, batch: 8, loss: 10.972790718, avg_batch_cost: 0.71011 sec, speed: 1.41 step/s, ips_total: 92290 tokens/s, ips: 11536 tokens/s, learning rate: 1.38889e-07 [2022-09-21 05:44:05,228] [ INFO] - [train] epoch: 0, batch: 9, loss: 10.983499527, avg_batch_cost: 0.71128 sec, speed: 1.41 step/s, ips_total: 92138 tokens/s, ips: 11517 tokens/s, learning rate: 1.52778e-07 ``` ### 多机训练 若需要在更多机器上进行大模型训练,则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令(master节点ip为训练所用某一台机器的ip即可)。 以2机16卡32G V100上的6.7B模型分组切分并行训练为例,启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型两机训练,也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \ -o Model.hidden_size=2048 ``` 若要执行16机175B大模型混合并行训练,以运行启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_mp8_pp16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=16 --devices "0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml ``` 当节点较多时,可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。 ### 量化训练 若需要对模型进行量化训练,按照以上在配置文件中添加量化参数,可参考`qat_gpt_345M_mp8.yaml`,量化训练时可以可以适当减少训练轮数和学习率。以单机345M模型模型并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行,命令如下: ```shell log_dir=log_mp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml -o Engine.max_steps=100000 \ -o Optimizer.lr.decay_steps=72000 \ -o Optimizer.lr.max_lr=5.0e-6 \ -o Optimizer.lr.min_lr=1.0e-6 ``` # GPT Zero-shot 文本生成 ## 参数释义 ```yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | top_k | 每次为采样挑选保留分数最高的 k 个 token | | top_p | 如果设置小于 1.0 的小数,则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0 | | temperature | 调节下一个 token 的概率温度,logits = logits / temperature,默认值为 1.0 | | min_dec_len | 最小生成 token 长度 | | max_dec_len | 最大生成 token 长度 | | num_return_sequences | 每个输入生成的序列个数,默认值为 1 | | decode_strategy | 解码策略,默认值为 "sampling",目前只支持 "sampling",未来会支持 "greedy_search","beam_search" | ## 文本生成 下载预训练好的模型,快速体验文本生成 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 mkdir -p ckpt wget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz tar -xzf ckpt/GPT_345M.tar.gz -C ckpt/ # --devices 根据并行策略设置设备 python -m paddle.distributed.launch --devices "0" tasks/gpt/generation.py \ -c 
ppfleetx/configs/nlp/gpt/generation_gpt_345M_dp8.yaml \ -o Engine.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/ # 生成的文本,由于 checkpoint 不同,超参不同,随机数不同,您执行可能会生成不一样的内容 Prompt: Hi, GPT2. Tell me who Jack Ma is. Generation: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.” For now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba. Jack Ma on why he never wanted to run for President in 2016: There were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family. On how Alibaba will evolve into a new player in China’s transportation and logistics sector: I think that we are going to become a very important player in the logistics industry. So our strategy is to make it easy for people to travel. ``` ### 剖析体验文本生成 #### GPT 文本生成模块初始化 ```python module = build_module(cfg) module.model.eval() ``` #### 预训练模型加载 ```python # 获取到预训练 checkpoint 的根目录 ckpt_dir = cfg.Engine.save_load.ckpt_dir # 构造出具体路径 model_path = os.path.join(ckpt_dir, "model.pdparams") # 加载模型参数 model_dict = paddle.load(model_path) # FP16 模型参数转成 FP32 模型参数 for key, value in model_dict.items(): model_dict[key] = model_dict[key].astype(paddle.float32) # 设置模型参数为预训练参数 module.model.set_state_dict(model_dict) ``` #### 文本生成与结果展示 ```python input_text = "Historical Records: Tell us about the history of the Great Wall." result = module.generate(input_text) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') ``` ================================================ FILE: projects/gpt/docs/hybrid_profiler.md ================================================ # Profiler 本文档主要包括在 GPT 中开启 Profiler 并分析调试分析结果的方法,在模型开发中使用 Profiler 分析工具的方法请参考[教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)和[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/profiler/Profiler_cn.html)。 ## 参数配置 使用 Profiler 功能需要在任务配置文件中添加 Profiler 配置信息并确保字段为 `enable: True` 以开启分析器。 完整的可配置参数如下所示,可以根据使用场景调整配置。 ``` Profiler: enable: True scheduler: [1, 5] profiler_log: log_path detailed: True record_shapes: True profile_memory: True summary: overview: True device: True model: True dist: True kernel: True op: True mem: True memcpy: True ``` 其中参数说明: | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | enable | 是否开启 Profiler | False | | scheduler | 定义分析区间,如 [1, 5] 记录 step 1 到 step 4 的分析数据 | None | | profiler_log | 日志文件目录 | profiler_log | | detailed | 是否显示详细信息 | False | | record_shapes | 是否记录 tensor shape 相关信息 | True | | profile_memory | 是否统计 memory 相关信息 | True | 其中,当 detailed=True 时会打印所有 summary 表格数据,当 detailed=False 时用户可以根据以下说明定制需要展示的表格信息。 | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | summary.overview | 显示每种类型的 Event 时间消耗 | True | | summary.device | 显示 CPU 和 GPU 的平均利用率信息 | False | | summary.model | 显示模型 dataloader、forward、backward、optimization 时间消耗 | True | | summary.dist | 显示计算、通信以及重叠时间 | False | | summary.kernel | 显示 GPU 执行的 kernel 信息 | True | | summary.op | 显示框架中算子 (op) 的执行信息 | True | | summary.mem | 显示内存/显存占用统计信息 | False | | summary.memcpy 
| 显示框架中调用内存操作所花费的时间 | False | ## 运行分析 本节以 gpt混合并行 为例,首先进入目录, ``` cd PaddleFleetX ``` 修改`ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml` 中 Profiler.enable 为 True, 同时可以根据上节说明调整相关配置,或者使用命令行参数覆盖,例如可以使用以下命令运行程序, ``` python -m paddle.distributed.launch \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml -o Profiler.enable=True ``` > 在使用 Profiler 工具进行性能分析时,建议减少 train 的步数,获得分析数据即可停止训练。 ## 结果分析 在训练结束后会有以下数据: * 根据配置信息在控制台打印 summary 表格 * 在配置的 `profiler_log` 目录保存 profiler json 文件 这里保存的 json 文件可以通过如下两种方式查看: * 在 chrome 浏览器中打开 chrome://tracing/,然后打开 json 文件查看 * 根据控制台信息安装并启动 `visualdl --logdir log_path` 然后根据提示在浏览器中**性能分析**模块查看 具体的信息含义解释以及分析方法请参考[文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)。 > 在使用 visualdl 时,如果 log 文件数据较大,启动会比较耗时,请耐心等待。 ## 附录 控制台打印的 summary 信息示例如下所示。 **Overview Summary** ``` ---------------------------------------------Overview Summary--------------------------------------------- Time unit: ms ------------------------- ------------------------- ------------------------- ------------------------- Event Type Calls CPU Time Ratio (%) ------------------------- ------------------------- ------------------------- ------------------------- ProfileStep 4 18591.04 100.00 CudaRuntime 87527 8555.11 46.02 Operator 21912 1883.11 10.13 UserDefined 13116 1841.33 9.90 OperatorInner 33668 1018.39 5.48 Forward 8 731.46 3.93 Backward 4 671.82 3.61 Optimization 4 315.91 1.70 Dataloader 4 1.37 0.01 ------------------------- ------------------------- ------------------------- ------------------------- Calls GPU Time Ratio (%) ------------------------- ------------------------- ------------------------- ------------------------- Kernel 16092 4924.90 26.49 Memcpy 4278 3617.26 19.46 Memset 780 2.31 0.01 Communication 192 2363.13 12.71 ------------------------- ------------------------- ------------------------- ------------------------- ``` **Model Summary** ``` -----------------------------------------------------Model Summary----------------------------------------------------- Time unit: ms --------------- ------ ----------------------------------------------- --------------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) --------------- ------ ----------------------------------------------- --------------------------------------------- ProfileStep 4 18591.04 / 4647.76 / 14114.47 / 757.27 / 100.00 4924.90 / 1231.22 / 2853.61 / 682.04 / 100.00 Dataloader 4 1.37 / 0.34 / 0.85 / 0.16 / 0.01 0.00 / 0.00 / 0.00 / 0.00 / 0.00 Forward 8 731.46 / 91.43 / 133.28 / 49.03 / 3.93 714.83 / 89.35 / 174.91 / 4.72 / 14.51 Backward 4 671.82 / 167.96 / 168.29 / 167.52 / 3.61 1701.53 / 425.38 / 426.97 / 424.10 / 34.55 Optimization 4 315.91 / 78.98 / 89.07 / 73.78 / 1.70 108.27 / 27.07 / 27.09 / 27.06 / 2.20 Others - 16870.48 / - / - / - / 90.75 2400.27 / - / - / - / 48.74 --------------- ------ ----------------------------------------------- --------------------------------------------- ``` **Operator Summary** ``` ----------------------------------------------------------------Operator Summary----------------------------------------------------------------- Time unit: ms ---------------------------------------------------- ------ ----------------------------------------- ---------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) ---------------------------------------------------- ------ 
----------------------------------------- ---------------------------------------- -----------------------------------------------------------Thread: All threads merged------------------------------------------------------------ GradNodePyLayer_RecomputeFunction_backward 96 663.37 / 6.91 / 17.17 / 4.01 / 18.56 1629.87 / 16.98 / 17.41 / 16.69 / 26.98 TransformerDecoderLayer 96 262.68 / 2.74 / 5.91 / 1.90 / 39.60 661.18 / 6.89 / 7.11 / 6.73 / 40.57 backward 96 318.62 / 3.32 / 10.57 / 1.31 / 48.03 968.69 / 10.09 / 10.31 / 9.91 / 59.43 matmul dygraph 2312 200.13 / 0.09 / 1.61 / 0.04 / 5.60 1487.76 / 0.64 / 9.81 / 0.22 / 24.63 matmul infer_meta 964 1.42 / 0.00 / 0.01 / 0.00 / 0.71 0.00 / 0.00 / 0.00 / 0.00 / 0.00 matmul compute 964 71.38 / 0.07 / 1.59 / 0.03 / 35.67 644.02 / 0.67 / 9.81 / 0.22 / 43.29 MEMSET 192 - / - / - / - / - 0.42 / 0.00 / 0.00 / 0.00 / 0.07 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 - / - / - / - / - 199.35 / 0.52 / 0.83 / 0.22 / 30.95 volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn 384 - / - / - / - / - 263.96 / 0.69 / 0.79 / 0.59 / 40.99 volta_h884gemm_64x128_ldg8_nn 192 - / - / - / - / - 141.13 / 0.74 / 0.92 / 0.61 / 21.91 void cutlass::Kernel 580 209.08 / 0.36 / 0.97 / 0.06 / 4.25 volta_h884gemm_64x128_ldg8_nn 288 203.89 / 0.71 / 0.92 / 0.57 / 4.14 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 199.35 / 0.52 / 0.83 / 0.22 / 4.05 volta_h884gemm_256x64_ldg8_tn 288 149.52 / 0.52 / 0.54 / 0.45 / 3.04 void phi::funcs::VectorizedBroadcastKernel 192 122.37 / 0.64 / 0.66 / 0.60 / 2.48 void cutlass::Kernel 100 103.07 / 1.03 / 8.08 / 0.73 / 2.09 void phi::funcs::VectorizedElementwiseKernelImagen

================================================ FILE: projects/imagen/README.md ================================================

# Imagen

Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding

* Paddle implementation of [Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding](https://arxiv.org/pdf/2205.11487.pdf), Google's text-to-image diffusion model that beats DALL-E 2.

## Updates

***20/September/2022:*** The code for the text-to-image and super-resolution models is released.

## Introduction

Imagen is a text-to-image diffusion model with an unprecedented degree of photorealism and a deep level of language understanding. Imagen builds on the power of large transformer language models in understanding text and hinges on the strength of diffusion models in high-fidelity image generation. Imagen utilizes a pipeline of a base 64 × 64 model and two text-conditional super-resolution diffusion models to upsample a 64 × 64 generated image into a 256 × 256 image, and then into a 1024 × 1024 image.
In comparison to previous text-to-image diffusion methods (e.g., DALL-E 2) that take advantage of multi-modal embeddings such as CLIP, Imagen benefits largely from the use of large pre-trained language models.
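The cascade described above can be sketched in a few lines. The snippet below is an illustrative outline only, not the PaddleFleetX API: `encode_text`, `base_64`, `sr_256`, and `sr_1024` are hypothetical stand-ins for the frozen text encoder and the three diffusion stages that this project's YAML configs wire together.

```python
# Illustrative sketch of Imagen's cascaded sampling; all names are hypothetical
# stand-ins, not the actual PaddleFleetX interfaces.
def generate_image(prompt, encode_text, base_64, sr_256, sr_1024):
    """Text prompt -> 64x64 sample -> 256x256 -> 1024x1024, conditioned on frozen text embeddings."""
    text_emb, text_mask = encode_text(prompt)              # frozen T5 / DeBERTa V2 embeddings
    img_64 = base_64.sample(text_emb, text_mask)           # base text-to-image diffusion model
    img_256 = sr_256.sample(img_64, text_emb, text_mask)   # first super-resolution stage
    return sr_1024.sample(img_256, text_emb, text_mask)    # second super-resolution stage
```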
    ## Usage ### Data preparing Imagen need text-image pairs for the training loop. For scaling purpose, we provide a [demo dataset](https://paddlefleetx.bj.bcebos.com/data/laion400m/part-00079) which textual embeddings and mask is precomputed. ``` cp part-00079 PaddleFleetX/projects/imagen ``` ### Imagen text encoder preparing Imagen need load pretrained text encoder model for the training loop. T5 and DeBERTa V2 are provided for Imagen. #### T5-11B ``` # T5 tokenizer and model was converted from Huggingface. config.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/config.json spiece.model: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/spiece.model tokenizer.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/tokenizer.json t5 model: wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.1 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.2 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.3 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.4 cat t5.pd.tar.gz.* |tar -xf - put them into t5 folder like this: PaddleFleetX/projects/imagen/t5 ├── t5-11b ├── config.json ├── spiece.model ├── t5.pd └── tokenizer.json ``` #### DeBERTa V2 1.5B ``` # DeBERTa V2 tokenizer and model was converted from Huggingface. config.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/config.json spm.model: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/spm.model tokenizer_config.json: https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/tokenizer_config.json denerta v2 model: wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.1 cat debertav2.pd.tar.gz.* | tar -xf - put them into cache folder like this: PaddleFleetX/projects/imagen/cache └── deberta-v-xxlarge ├── config.json ├── debertav2.pd ├── spm.model ├── tokenizer_config.json ``` ### Train Imagen with T5-11B text encoder ``` cd PaddleFleetX/ ``` Train Imagen text-to-image 64×64 397M diffusion model with single gpu. ``` sh projects/imagen/run_text2im_397M_64x64_single_card.sh ``` Train Imagen text-to-image 64×64 397M diffusion model with 128 gpus. ``` sh projects/imagen/run_text2im_397M_64x64_dp128.sh ``` Train Imagen text-to-image 64×64 2B diffusion model with 256 gpus. - The 2B parameters diffusion model use Group Sharded data parallelism techniques to eliminate memory redundacies by partitioning the optimizer states, gradients, and parameters across multiple devices. ``` cd PaddleFleetX/ sh projects/imagen/run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh ``` ### Train DeBERTaV2 1.5B Imagen diffusion model with 8 gpus. ``` cd PaddleFleetX/ sh projects/imagen/run_text2im_64x64_DebertaV2_dp8.sh ``` ### Train Imagen Super Resolusion 256×256 diffusion model. Train Imagen Super Resolusion 256×256 diffusion model with single gpu. ``` cd PaddleFleetX/ sh projects/imagen/run_super_resolution_256_single_card.sh ``` Train Imagen Super Resolusion 256×256 diffusion model with 128 gpus. ``` cd PaddleFleetX/ sh projects/imagen/run_super_resolution_256_dp128.sh ``` Train Imagen Super Resolusion 1024×1024 diffusion model with 128 gpus. - The 1024x1024 super resolution diffusion model use checkpointing techniques to eliminate intermediate variable memory redundacies. 
``` cd PaddleFleetX/ sh projects/imagen/run_super_resolution_1024_sharding128.sh ``` ## Citing Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding ``` @article{chen2022context, title={Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding}, author={Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, Mohammad Norouzi}, journal={arXiv preprint arXiv:2205.11487}, year={2022} } ``` ================================================ FILE: projects/imagen/filelist/laion_400M/train ================================================ projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 
projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 ================================================ FILE: projects/imagen/run_super_resolution_1024_sharding128.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
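# Editor's note (assumption, not part of the original script): the -o overrides below switch on
# sharding stage 2 across the 8 local GPUs, disable mixed precision, reduce the per-card batch size
# to 1, and enable activation recomputation -- the "checkpointing" technique the README mentions for
# fitting the 1024x1024 super-resolution model into memory.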
log_dir=log_sharding rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml \ -o Distributed.sharding.sharding_stage=2 \ -o Distributed.sharding.sharding_degree=8 \ -o Engine.mix_precision.enable=False \ -o Data.Train.loader.batch_size=1 \ -o Model.use_recompute=True \ ================================================ FILE: projects/imagen/run_super_resolution_256_dp128.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_sharding rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml \ -o Distributed.dp_degree=128 ================================================ FILE: projects/imagen/run_super_resolution_256_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python3 tools/train.py -c ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml ================================================ FILE: projects/imagen/run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
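# Editor's note (assumption, not part of the original script): for the 2B-parameter model the
# overrides below combine Group Sharded parallelism (sharding stage 2, degree 8 within each group)
# with 32-way data parallelism across groups, i.e. the 256-GPU configuration described in the README.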
log_dir=log_sharding rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml \ -o Distributed.sharding.sharding_stage=2 \ -o Distributed.dp_degree=32 \ -o Distributed.sharding.sharding_degree=8 ================================================ FILE: projects/imagen/run_text2im_397M_64x64_dp128.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_dp128 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml \ -o Distributed.dp_degree=128 ================================================ FILE: projects/imagen/run_text2im_397M_64x64_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python3 tools/train.py -c ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml ================================================ FILE: projects/imagen/run_text2im_64x64_DebertaV2_dp8.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
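# Launches single-node training of the Imagen 64x64 text-to-image model with a DebertaV2
# text encoder, using plain data parallelism across the 8 local GPUs (dp_degree=8).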
log_dir=log_dp8 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml \ -o Distributed.dp_degree=8 ================================================ FILE: projects/moco/README.md ================================================ # MoCo ![MoCo](https://user-images.githubusercontent.com/11435359/71603927-0ca98d00-2b14-11ea-9fd8-10d984a2de45.png) This is a PaddlePaddle implementation of [MoCo v1](https://arxiv.org/abs/1911.05722) and [MoCo v2](https://arxiv.org/abs/2003.04297). ## Installation MoCo requires `PaddlePaddle >= 2.4`. ```shell # git clone https://github.com/PaddlePaddle/PaddleFleetX.git cd /path/to/PaddleFleetX ``` All commands are executed in the `PaddleFleetX` root directory. ```shell python -m pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ## Data Preparation The ImageNet-1k dataset needs to be prepared first and organized into the following directory structure. ```shell ILSVRC2012 ├── train/ ├── xxx ├── val/ └── xxx ``` Then link the dataset into the expected path. ```shell mkdir -p dataset ln -s /path/to/ILSVRC2012 dataset/ILSVRC2012 ``` ## Unsupervised Training To run unsupervised pre-training of a ResNet-50 model on ImageNet on a single machine with 8 GPUs, use one of the following scripts: ### MoCo V1 (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml ``` ### MoCo V2 (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml ``` MoCo v2 differs from MoCo v1 in the following ways: * MoCo v2 adds an MLP projection head * stronger data augmentation * a different softmax temperature * a cosine learning rate schedule ## Linear Classification Once unsupervised pre-training has finished (or after downloading one of the provided pre-trained checkpoints), you can use the following scripts to train a supervised linear classifier on top of the frozen encoder.
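Conceptually, linear classification ("linear probing") keeps the pre-trained encoder frozen and trains only a newly added classification head on its features. The snippet below is a minimal sketch of that idea in plain Paddle, not the PaddleFleetX implementation: the yaml configs and the `pretrained` override in the commands below take care of loading the MoCo encoder, and the ResNet-50 backbone and learning-rate value here are illustrative assumptions.

```python
import paddle
from paddle.vision.models import resnet50

# Illustrative backbone; in PaddleFleetX the MoCo-pre-trained encoder is loaded via
# `-o Model.model.base_encoder.pretrained=...` as shown in the commands below.
model = resnet50(num_classes=1000)

# Freeze everything except the final fully connected layer (the linear classifier).
for name, param in model.named_parameters():
    param.stop_gradient = not name.startswith("fc")

# Only the classifier head is updated during linear classification training.
head_params = [p for n, p in model.named_parameters() if n.startswith("fc")]
optimizer = paddle.optimizer.Momentum(
    learning_rate=30.0,  # a large LR is common for linear probing; value is illustrative
    momentum=0.9,
    parameters=head_params)
```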
### MoCo v1 #### [Optional] Download checkpoint ```shell mkdir -p pretrained/moco/ wget -O ./pretrained/moco/mocov1_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.pdparams ``` #### Linear Classification Training (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov1_pt_imagenet2012_resnet50 ``` ### MoCo v2 #### [Optional] Download checkpoint ```shell mkdir -p pretrained/moco/ wget -O ./pretrained/moco/mocov2_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams ``` #### Linear Classification Training (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov2_pt_imagenet2012_resnet50 ``` ## Models | Model | Phase | Epochs | Top1 Acc | Checkpoint | Log | | ------- | --------------------- | ------ | -------- | ------------------------------------------------------------ | ------------------------------------------------------------ | | MoCo v1 | Unsupervised Training | 200 | - | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.log) | | MoCo v1 | Linear Classification | 100 | 0.606141 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_lincls_imagenet2012_resnet50.log) | | MoCo v2 | Unsupervised Training | 200 | - | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.log) | | MoCo v2 | Linear Classification | 100 | 0.676595 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.log) | ## Citations ``` @Article{he2019moco, author = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick}, title = {Momentum Contrast for Unsupervised Visual Representation Learning}, journal = {arXiv preprint arXiv:1911.05722}, year = {2019}, } @Article{chen2020mocov2, author = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He}, title = {Improved Baselines with Momentum Contrastive Learning}, journal = {arXiv preprint arXiv:2003.04297}, year = {2020}, } ``` ================================================ FILE: projects/moco/run_mocov1_lincls_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov1_pt_imagenet2012_resnet50 ================================================ FILE: projects/moco/run_mocov1_pretrain_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml ================================================ FILE: projects/moco/run_mocov2_lincls_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov2_pt_imagenet2012_resnet50 ================================================ FILE: projects/moco/run_mocov2_pretrain_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml ================================================ FILE: projects/protein_folding/README.md ================================================ # Protein Folding 声明: 本项目不提供具体能运行的蛋白质结构预测程序,如果想体验直接能运行的蛋白质结构预测代码,请跳转到 [HelixFold](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold) 中运行。 本项目是一个教程,展示如何将数据并行、动态轴并行、分支并行(DP-DAP-BP)混合并行接入到 HelixFold 中。 想要在 HelixFold 中使用混合并行,则涉及到以下几个方面: * 依赖安装 * 通信初始化 * 混合并行网络模型使用 * 优化器设置 DAP 和 BP 属性 * 参数同步与梯度同步 ## 依赖安装 ```shell pip install ppfleetx ``` ## 通信初始化 ```python from ppfleetx.distributed.protein_folding import dp from ppfleetx.distributed.protein_folding.scg import scg def init_distributed_env(args): dp_rank = 0 # ID for current device in distributed data parallel collective communication group dp_nranks = 1 # The number of devices in distributed data parallel collective communication group if args.distributed: # init bp, dap, dp hybrid distributed environment scg.init_process_group(parallel_degree=[('dp', None), ('dap', args.dap_degree), ('bp', args.bp_degree)]) dp_nranks = dp.get_world_size() dp_rank = dp.get_rank_in_group() if dp_nranks > 1 else 0 if args.bp_degree > 1 or args.dap_degree > 1: assert args.seed is not None, "BP and DAP should be set seed!" 
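    # A fixed seed is required because DAP and BP replicate rather than shard the parameters,
    # so every rank must initialize them identically (they are synchronized again below).
    # dp_rank / dp_nranks are what the caller would use to shard training data across the
    # data-parallel group (assumption based on the comments above).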
return dp_rank, dp_nranks ``` ## 混合并行网络模型使用 目前,在 HelixFold 网络模型中涉及到混合并行的有 Embedding 和 Evoformer 类,因此可以将原来 HelixFold 中的 `EmbeddingsAndEvoformer` 修改为 `DistEmbeddingsAndEvoformer`。在网络模型中涉及 `DAP` 和 `BP` 的网络模型修改都在 [DistEmbeddingsAndEvoformer](../../ppfleetx/models/protein_folding/evoformer.py) 中封装, ```python from ppfleetx.models.protein_folding.evoformer import DistEmbeddingsAndEvoformer evoformer = DistEmbeddingsAndEvoformer( self.channel_num, self.config.embeddings_and_evoformer, self.global_config) ``` ## 优化器设置 DAP 和 BP 属性 由于 `DAP` 和 `BP` 在网络模型中分别切分的是中间激活值和网络计算分支,参数是没有切分的,因此在梯度同步的时候, 是需要区分同步的。我们将 `dap` 和 `bp` 属性设置在优化器参数分组中作为区分,并在后续梯度同步的时候使用。 ```python evoformer_params = [] template_and_pair_transition_params = [] other_params = [] for name, p in model.named_parameters(): if 'template_pair_stack' in name or 'pair_transition' in name: template_and_pair_transition_params.append(p) elif 'evoformer_iteration' in name or 'extra_msa_stack' in name: evoformer_params.append(p) else: other_params.append(p) parameters = [] if args.dap_degree > 1 or args.bp_degree > 1: parameters.append({'params': get_fused_params(other_params)}) parameters.append({'params': get_fused_params(evoformer_params), 'dap': True, 'bp': True}) parameters.append({'params': get_fused_params(template_and_pair_transition_params), 'dap': True}) else: parameters.append({'params': get_fused_params(other_params + evoformer_params + template_and_pair_transition_params)}) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, epsilon=1e-06, grad_clip=grad_clip, parameters = parameters ) ``` ## 参数同步与梯度同步 ### 参数同步 虽然是 `DP-DAP-BP` 混合并行,但是每个设备上的模型参数是没有切分的,因为在模型训练之前也需要做一次参数同步。 ```python from ppfleetx.distributed.protein_folding import dp model = RunModel(train_config, model_config) dp.param_sync(model, src_rank=0) ``` ### 梯度同步 如上节所述,在梯度同步的时候需要分别对 `DP`,`DAP`,`BP` 并行策略相关的模型参数的梯度进行同步。 ```python from ppfleetx.distributed.protein_folding import dap, bp, dp loss.backward() # sync the gradient for branch parallel firstly bp.grad_sync(optimizer._param_groups) # then sync the gradient for dap dap.grad_sync(optimizer._param_groups) # finally sync the gradient for ddp dp.grad_sync(optimizer._param_groups) optimizer.step() optimizer.clear_grad() ``` ## 论文引用 ``` @article{wang2022helixfold, title={HelixFold: An Efficient Implementation of AlphaFold2 using PaddlePaddle}, author={Wang, Guoxia and Fang, Xiaomin and Wu, Zhihua and Liu, Yiqun and Xue, Yang and Xiang, Yingfei and Yu, Dianhai and Wang, Fan and Ma, Yanjun}, journal={arXiv preprint arXiv:2207.05477}, year={2022} } @article{wang2022efficient_alphafold2, title={Efficient AlphaFold2 Training using Parallel Evoformer and Branch Parallelism}, author={Wang, Guoxia and Wu, Zhihua and Fang, Xiaomin and Xiang, Yingfei and Liu, Yiqun and Yu, Dianhai and Ma, Yanjun}, journal={arXiv preprint arXiv:2211.00235}, year={2022} } ``` ================================================ FILE: projects/ufo2.0/README.md ================================================ # VIMER-UFO 2.0 (文心-CV大模型) ## 整体概述 近年来预训练大模型一次次刷新记录,展现出惊人的效果,但对于产业界而言,势必要面对如何应用落地的问题。当前预训练模型的落地流程可被归纳为:针对只有少量标注数据的特定任务,使用任务数据 fine-tune 预训练模型并部署上线。然而,当预训练模型参数量不断增大后,该流程面临两个严峻的挑战。首先,随着模型参数量的急剧增加,大模型 fine-tuning 所需要的计算资源将变得非常巨大,普通开发者通常无法负担。其次,随着 AIoT 的发展,越来越多 AI 应用从云端往边缘设备、端设备迁移,而大模型却无法直接部署在这些存储和算力都极其有限的硬件上。 针对预训练大模型落地所面临的问题,百度提出统一特征表示优化技术(UFO:Unified Feature Optimization),在充分利用大数据和大模型的同时,兼顾大模型落地成本及部署效率。VIMER-UFO 2.0 技术方案的主要内容包括: * Task MoE: 飞桨多任务超网络分布式训练架构,支持训练任务动态扩展,特定任务任意切分,保证多任务之间信息有效借鉴,负载均衡,高效协同。 * All in One:行业最大 170 
亿参数视觉多任务模型,覆盖人脸、人体、车辆、商品、食物细粒度分类等 20+ CV 基础任务,单模型 28 个公开测试集效果 SOTA。 * One for All:首创针对视觉多任务的超网络与训练方案,支持各类任务、各类硬件的灵活部署,解决大模型参数量大,推理性能差的问题。 ![图1:UFO整体架构](./img/UFO_v2_1.png) ## 模型效果 文心VIMER-UFO 2.0大模型是基于飞桨的Task MoE架构构建多任务超网络,模型参数量达到170亿,单模型28项公开数据集SOTA。基于飞桨Task MoE架构,可以根据任务的不同自动选择激活最优的区域,从而实现100倍参数压缩,同时支持下游任务快速扩展,是行业最大的视觉多任务统一大模型。尽管 VIMER-UFO 2.0 大模型参数量达到了170 亿,得益于 Task-MoE 稀疏结构,每个任务推理时只需激活部分参数,计算量相当于 6 亿参数模型规模,加速比接近 30 倍。更多细节请参看[VIMER-UFO 2.0](https://github.com/PaddlePaddle/VIMER/tree/develop/UFO)。 ![图2:UFO_Result](./img/UFO_v2_2.png) ## 飞桨Task MoE分布式训练架构 如此大的参数规模和任务数,给模型的训练带来了巨大的挑战。文心VIMER-UFO 2.0大模型采用稀疏门控混合专家设计,仅参数存储就需要68G,给训练时的模型存储带来了压力;该模型在前向反向时所有计算节点间会进行同步等待的All-to-All通信,使得通信负担明显加大;此外,该模型的多任务数目是动态的,且多个任务之间样本严重不均衡,使得计算节点之间的同步等待较长,影响并发效率。 针对这些挑战,飞桨提出了Task MoE分布式训练架构,不仅实现多级并行存储稀疏参数,还支持硬件拓扑感知通信,使得层次化All-to-All通信效率提升20%。同时飞桨还创新性地提出了基于Task的负载均衡机制,支持任务数量的动态扩展、特定任务的任意切分以及多个任务在不同的专家下的并发训练,同等实验环境下训练性能比PyTorch提升66%。同时,该方案保障多任务之间信息借鉴机制的有效性,使得VIMER-UFO 2.0模型精度大幅提升。此外,在推理阶段,基于飞桨Task MoE架构构建的多任务多路径的超网络,可支持任务粒度的路径选择,方便灵活部署。 ![图3:UFO_Perf](./img/UFO_Perf.png) ## 使用方案 1. 有关UFO的更多细节原理请参看[VIMER-UFO 2.0](https://github.com/PaddlePaddle/VIMER/tree/develop/UFO)。 2. VIMER-UFO 2.0 相关的模型、训练代码和评测脚本均已开源,更多细节正在逐渐完善中,了解详细信息可访问:https://github.com/PaddlePaddle/VIMER/tree/main/UFO/OneForAll ================================================ FILE: projects/vit/README.md ================================================ # Vision Transformer This project implements the (Vision Transformer) proposed by google [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929). ## How to pretrain from scratch on imagenet2012 ### Go to the main repo directory All commands are executed in the home directory. ``` cd /path/to/PaddleFleetX ``` ### Data The imagenet 1k dataset needs to be prepared first and will be organized into the following directory structure. ``` ILSVRC2012 ├── train/ ├── train_list.txt ├── val/ └── val_list.txt ``` Then configure the path. ```shell mkdir -p dataset ln -s /path/to/ILSVRC2012 dataset/ILSVRC2012 ``` ### Train ViT-B/16 Note: ViT-B/16 needs run on 2 nodes with 16 A100 GPUs. If you only have a low-memory GPU, you can use gradient accumulation by setting `accumulate_steps` in yaml. The following commands need to be run on each node. ```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml ``` ## Finetune ViT-B/16 ### [Optional] Download checkpoint ```shell mkdir -p pretrained/vit/ wget -O ./pretrained/vit/imagenet2012-ViT-B_16-224.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams ``` ### Finetune on imagenet2012 Finetune is similar to pre-training on ImageNet2012 dataset, we have provided the configured yaml file. ```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ``` ### Finetune on cifar10 Note: CIFAR10 dataset is automatically downloaded and cached. 
```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_cifar10_1n8c_dp_fp16o2.yaml ``` ### Quantization Aware Training on ImageNet2012 ```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.drop_rate=0.0 \ -o Data.Train.sampler.batch_size=16 \ -o Optimizer.lr.learning_rate=5e-05 \ -o Optimizer.weight_decay=0.0002 ``` 量化训练的参数详细介绍见[模型压缩介绍](../../../docs/compression.md)。 ## Model | Model | Phase | Size | Dataset | Resolution | GPUs | Img/sec | Top1 Acc | Pre-trained checkpoint | Fine-tuned checkpoint | Log | |----------|----------|--------|--------------|------------|-------------|---------|----------|----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------| | ViT-B_16 | pretrain | 167MiB | ImageNet2012 | 224 | A100*N2C16 | 7350 | 74.75% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | - | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.log) | | ViT-B_16 | finetune | 167MiB | ImageNet2012 | 384 | A100*N2C16 | 1580 | 77.68% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.log) | | ViT-L_16 | finetune | 582MiB | ImageNet2012 | 384 | A100*N2C16 | 519 | 85.13% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k-jax-ViT-L_16-224.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k+imagenet2012-ViT-L_16-384.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k+imagenet2012-ViT-L_16-384.log) | | Quantized ViT-B_16 | finetune | 167MiB | ImageNet2012 | 384 | A100*N2C16 | 1580 | 77.71% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/quantized_imagenet2012-ViT-B_16-384.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/quantized_imagenet2012-ViT-B_16-384.log) | # 推理部署 参考[这里](./docs/inference.md) ================================================ FILE: projects/vit/auto_vit_patch16_224_dp8.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
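# Trains ViT-tiny (patch16, 224) through the auto-parallel entry point (tools/auto.py)
# on the CIFAR10 CI config, using 8-way data parallelism and FP16 O2 on a single node
# with 8 GPUs (as encoded in the yaml file name).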
log_dir=log_auto rm -rf $log_dir # tiny_patch16_224+dp8 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ppfleetx/configs/vis/vit/auto/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml ================================================ FILE: projects/vit/docs/inference.md ================================================ # Inference Deployment After training, the model can be deployed for inference with Paddle Inference, PaddlePaddle's high-performance inference engine, as follows. ```bash sh projects/vit/run_inference_base_patch16_224.sh ``` The individual steps are described below. ## 1. Model Export First, export the trained model as an inference model for deployment. This is done with `tools/export.py`: `-c` specifies the config file of the model to export, and `-o Engine.save_load.ckpt_dir=` specifies the weights used for the export. Taking the `ViT-224` model as an example, download the trained weights released by PaddleFleetX as follows. If you have already downloaded them, or want to use weights from your own training run, you can skip this step. ```bash mkdir -p ckpt wget -O ckpt/model.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams ``` Export the inference model as follows. ```bash python tools/export.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml \ -o Engine.save_load.ckpt_dir=./ckpt/ ``` The exported model is saved to the `./output` directory by default; this can be changed via `Engine.save_load.output_dir` in the config file or with `-o Engine.save_load.output_dir=`. ## 2. Inference Deployment Once the model has been exported, inference can be run with the `projects/vit/inference.py` script. ```bash python projects/vit/inference.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml ``` ================================================ FILE: projects/vit/export_qat.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/export.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.drop_rate=0.0 \ -o Data.Train.sampler.batch_size=16 \ -o Optimizer.lr.learning_rate=5e-05 \ -o Optimizer.weight_decay=0.0002 ================================================ FILE: projects/vit/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
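# ViT image-classification demo built on Paddle Inference (via EagerEngine in 'inference'
# mode). It preprocesses projects/vit/images/demo.jpg (resize to 224x224, normalize with
# mean/std 0.5, NCHW layout) and, if no TensorRT dynamic-shape file (shape.pbtxt) exists
# yet, first runs one pass with Inference.TensorRT.collect_shape=True to gather shapes
# before running the actual inference pass. The softmax over the output tensor
# ('linear_99.tmp_1') yields the predicted class index and its probability.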
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import numpy as np from PIL import Image import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) from ppfleetx.utils import config from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader, tokenizers from ppfleetx.models import build_module from ppfleetx.core import EagerEngine def softmax(x): exp_x = np.exp(x) return exp_x/np.sum(exp_x) def preprocess(img_path): """preprocess Preprocess to the input. Args: img_path: Image path. Returns: Input data after preprocess. """ with open(img_path, "rb") as f: img = Image.open(f) img = img.convert("RGB") # ResizeImage img = img.resize((224,224), Image.BILINEAR) # NormalizeImage scale = np.float32(1.0/255.0) mean = [0.5, 0.5, 0.5] std = [0.5, 0.5, 0.5] shape = (1, 1, 3) mean = np.array(mean).reshape(shape).astype('float32') std = np.array(std).reshape(shape).astype('float32') img = (img * scale - mean) / std # ToNCHW img = img.transpose((2, 0, 1)) img = np.expand_dims(img, axis=0) return img if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) env.set_seed(cfg.Global.seed) np.random.seed(1) img_path = 'projects/vit/images/demo.jpg' img = preprocess(img_path) if(os.path.exists('shape.pbtxt')==False): cfg.Inference.TensorRT.collect_shape = True module = build_module(cfg) engine = EagerEngine(configs=cfg,module=module, mode='inference') outs = engine.inference([img]) cfg.Inference.TensorRT.collect_shape = False module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg,module=module, mode='inference') outs = engine.inference([img]) res = softmax(outs['linear_99.tmp_1']) max_index = np.argmax(res, axis=-1) print("类型: ", max_index[0],) print("概率: ", res[0][max_index[0]]) ================================================ FILE: projects/vit/run_finetune.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml #python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ================================================ FILE: projects/vit/run_finetune_fused_attention.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.use_fused_attn=True ================================================ FILE: projects/vit/run_inference_base_patch16_224.sh ================================================ echo "step 1: download parameters" mkdir -p ckpt wget -O ckpt/model.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams echo "step 2: export model" python tools/export.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml \ -o Engine.save_load.ckpt_dir=./ckpt/ echo "step 3: run VIT inference" python projects/vit/inference.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml ================================================ FILE: projects/vit/run_pretrain.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml ================================================ FILE: projects/vit/run_pretrained_fused_attention.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.use_fused_attn=True ================================================ FILE: projects/vit/run_qat.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.drop_rate=0.0 \ -o Data.Train.sampler.batch_size=16 \ -o Optimizer.lr.learning_rate=5e-05 \ -o Optimizer.weight_decay=0.0002 ================================================ FILE: requirements.txt ================================================ paddleslim @ https://paddle-qa.bj.bcebos.com/PaddleSlim/paddleslim-0.0.0.dev0-py3-none-any.whl paddlenlp @ https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl requests==2.25.1 regex==2022.7.25 colorlog==6.6.0 colorama==0.4.5 omegaconf==2.2.2 tqdm>=4.62.1 pybind11==2.10.0 numpy>=1.19.5,<=1.21.6 opencv-python>=4.2.0.32 Pillow==9.3.0 blobfile==1.3.3 ================================================ FILE: setup.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from setuptools import setup, Extension, find_packages from ppfleetx.data.data_tools.cpp.compile import compile_helper compile_helper() def fetch_requirements(path): with open(path, 'r') as fd: return [r.strip() for r in fd.readlines()] install_requires = fetch_requirements('requirements.txt') setup( name='ppfleetx', version='0.0.0', description='PaddleFleetX', author='PaddlePaddle Authors', url='https://github.com/PaddlePaddle/PaddleFleetX', install_requires=install_requires, package_data={ 'ppfleetx.data.data_tools.cpp': ['fast_index_map_helpers.so'] }, packages=find_packages()) ================================================ FILE: tasks/gpt/generation.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
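# Text-generation entry point for GPT: builds the model from the yaml config, optionally
# loads Engine.save_load.ckpt_dir/model.pdparams (casting the weights to float32), and
# prints the continuation of a fixed demo prompt produced by module.generate().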
import argparse import math import os import random import time import sys import yaml import numpy as np import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) from ppfleetx.utils import config from ppfleetx.models import build_module from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) module.model.eval() ckpt_dir = cfg.Engine.save_load.ckpt_dir if ckpt_dir is not None: model_path = os.path.join(ckpt_dir, "model.pdparams") model_dict = paddle.load(model_path) for key, value in model_dict.items(): model_dict[key] = model_dict[key].astype(paddle.float32) module.model.set_state_dict(model_dict) input_text = 'Hi, GPT2. Tell me who Jack Ma is.' result = module.generate(input_text) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') ================================================ FILE: tasks/gpt/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader, tokenizers from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) tokenizer = tokenizers.GPTTokenizer.from_pretrained("gpt2") engine = EagerEngine(configs=cfg, module=module, mode='inference') input_text = 'Hi, GPT2. Tell me who Jack Ma is.' input_ids = [tokenizer.encode(input_text)] outs = engine.inference([input_ids]) ids = list(outs.values())[0] out_ids = [int(x) for x in ids[0]] result = tokenizer.decode(out_ids) result = input_text + result print('Prompt:', input_text) print('Generation:', result) ================================================ FILE: tasks/gpt/run_generation.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # for single card generation export CUDA_VISIBLE_DEVICES=0 python tasks/gpt/generation.py -c ./ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml ================================================ FILE: tools/auto.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import random import paddle import numpy as np import paddle.distributed as dist from paddle.distributed import fleet __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.models import build_module from ppfleetx.data import build_auto_dataset from ppfleetx.core import AutoEngine #init_logger() if __name__ == "__main__": args = config.parse_args() cfg = config.get_auto_config( args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: fleet.init(is_collective=True) module = build_module(cfg) config.print_config(cfg) train_data = build_auto_dataset(cfg.Data, "Train") eval_data = build_auto_dataset(cfg.Data, "Eval") cfg.Optimizer.lr.update({ 'epochs': cfg.Engine.num_train_epochs, 'step_each_epoch': len(train_data) }) engine = AutoEngine(configs=cfg, module=module) if cfg.Engine.save_load.ckpt_dir is not None: engine.load() if cfg.get('Tuning', None) and cfg.Tuning.enable: engine.tune(train_data) else: engine.fit(train_dataset=train_data, valid_dataset=eval_data, epoch=cfg.Engine.num_train_epochs) ================================================ FILE: tools/auto_export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
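# Export entry point for models trained with auto parallel: if the config contains a
# Model section, the module is rebuilt and exported with AutoEngine.export(); otherwise
# the inference program is exported directly from the saved checkpoint via
# export_from_prog(), in which case Engine.save_load.ckpt_dir must be set.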
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import random import paddle import numpy as np __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.models import build_module from ppfleetx.core import AutoEngine if __name__ == "__main__": args = config.parse_args() cfg = config.get_auto_config( args.config, overrides=args.override, show=False) if cfg.get('Model', None) is not None: module = build_module(cfg) config.print_config(cfg) engine = AutoEngine(configs=cfg, module=module, mode="export") if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.export() else: engine = AutoEngine(configs=cfg, mode="export") if cfg.Engine.save_load.ckpt_dir is None: raise ValueError("invalid ckpt_dir.") engine.export_from_prog() ================================================ FILE: tools/eval.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.data import build_dataloader from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg, module=module, mode='eval') valid_data_loader = build_dataloader(cfg.Data, "Eval") if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.evaluate( valid_data_loader=valid_data_loader, epoch=cfg.Engine.num_train_epochs) ================================================ FILE: tools/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
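# Export entry point for dygraph-trained models: builds the module from the yaml config,
# optionally restores weights from Engine.save_load.ckpt_dir, and writes an inference
# model via EagerEngine in 'export' mode (as used in projects/vit/docs/inference.md).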
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg, module=module, mode='export') if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.export() ================================================ FILE: tools/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env # init_logger() if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg, module=module, mode='inference') test_data_loader = build_dataloader(cfg.Data, "Test") for iter_id, data in enumerate(test_data_loader()): outs = engine.inference(data) if iter_id >= cfg.Engine.test_iters: break logger.info("The inference process is complete.") del test_data_loader ================================================ FILE: tools/train.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
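# Main dygraph training entry point: selects the device from Global.device, initializes
# the distributed environment when launched with more than one process, builds the module
# and the Train/Eval dataloaders, fills in the LR scheduler config (epochs, steps per
# epoch, total steps), optionally resumes from Engine.save_load.ckpt_dir, and runs
# engine.fit().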
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env def set_default_flags(flags): for flag_name, flag_value in flags.items(): if os.getenv(flag_name) is None: paddle.set_flags({flag_name: flag_value}) if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) paddle.set_device(cfg["Global"]["device"]) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) train_data_loader = build_dataloader(cfg.Data, "Train") eval_data_loader = build_dataloader(cfg.Data, "Eval") cfg.Optimizer.lr.update({ 'epochs': cfg.Engine.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': cfg.Engine.max_steps, }) engine = EagerEngine(configs=cfg, module=module) if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.fit(train_data_loader=train_data_loader, valid_data_loader=eval_data_loader, epoch=cfg.Engine.num_train_epochs)