Repository: PaddlePaddle/PaddleFleetX Branch: develop Commit: 20f33ad21e9d Files: 507 Total size: 2.6 MB Directory structure: gitextract_it7z4sjw/ ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── benchmarks/ │ ├── README.md │ └── test_tipc/ │ ├── ernie/ │ │ └── dygraph/ │ │ └── hybrid_parallel/ │ │ ├── N1C1/ │ │ │ ├── ernie_bs16_fp16_DP1-MP1-PP1.sh │ │ │ └── ernie_bs16_fp32_DP1-MP1-PP1.sh │ │ ├── N1C8/ │ │ │ ├── ernie_bs16_fp16_DP2-MP2-PP2.sh │ │ │ └── ernie_bs16_fp32_DP2-MP2-PP2.sh │ │ ├── N4C32/ │ │ │ ├── ernie_bs16_fp16_DP1-MP8-PP4.sh │ │ │ ├── ernie_bs16_fp16_DP2-MP8-PP2.sh │ │ │ ├── ernie_bs16_fp16_DP4-MP8-PP1.sh │ │ │ ├── ernie_bs16_fp32_DP1-MP8-PP4.sh │ │ │ ├── ernie_bs16_fp32_DP2-MP8-PP2.sh │ │ │ └── ernie_bs16_fp32_DP4-MP8-PP1.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ ├── gpt/ │ │ ├── dygraph/ │ │ │ ├── data_parallel/ │ │ │ │ ├── N1C8/ │ │ │ │ │ ├── gpt_1024_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ │ ├── gpt_1024_flash_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ │ └── gpt_2048_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ ├── finetune/ │ │ │ │ ├── N1C1/ │ │ │ │ │ ├── CE_gpt_finetune_CoLA_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_MRPC_acc_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_MRPC_f1_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_QNLI_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_RTE_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_SST2_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_STSB_pearson_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ ├── CE_gpt_finetune_STSB_spearman_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ │ └── CE_gpt_finetune_WNLI_bs32_fp16_DP1-MP1-PP1.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ ├── hybrid_parallel/ │ │ │ │ ├── N1C1/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP1-PP1.sh │ │ │ │ │ └── gpt_bs16_fp32_DP1-MP1-PP1.sh │ │ │ │ ├── N1C4/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP1-PP4.sh │ │ │ │ │ └── gpt_bs16_fp16_DP1-MP4-PP1.sh │ │ │ │ ├── N1C8/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP1-PP8.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP2-PP4.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP4-PP2.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP8-PP1.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP2-MP2-PP2.sh │ │ │ │ │ ├── gpt_bs16_fp32_DP2-MP2-PP2.sh │ │ │ │ │ ├── gpt_bs64_fp16_DP8-MP1-PP1.sh │ │ │ │ │ ├── gpt_bs64_fp32_DP8-MP1-PP1.sh │ │ │ │ │ ├── gpt_recompute_bs16_fp16_DP2-MP2-PP2.sh │ │ │ │ │ └── gpt_recompute_bs16_fp32_DP2-MP2-PP2.sh │ │ │ │ ├── N4C32/ │ │ │ │ │ ├── gpt_bs16_fp16_DP1-MP8-PP4.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP2-MP8-PP2.sh │ │ │ │ │ ├── gpt_bs16_fp16_DP4-MP8-PP1.sh │ │ │ │ │ ├── gpt_bs16_fp32_DP1-MP8-PP4.sh │ │ │ │ │ ├── gpt_bs16_fp32_DP2-MP8-PP2.sh │ │ │ │ │ └── gpt_bs16_fp32_DP4-MP8-PP1.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ ├── sequence_parallel/ │ │ │ │ ├── N1C8/ │ │ │ │ │ ├── gpt_sp_False_bs8_fp16_DP1-MP8-PP1.sh │ │ │ │ │ └── gpt_sp_True_bs8_fp16_DP1-MP8-PP1.sh │ │ │ │ ├── N4C32/ │ │ │ │ │ ├── gpt_sp_False_bs16_fp16_DP2-MP8-PP2.sh │ │ │ │ │ └── gpt_sp_True_bs16_fp16_DP2-MP8-PP2.sh │ │ │ │ └── benchmark_common/ │ │ │ │ ├── prepare.sh │ │ │ │ └── run_benchmark.sh │ │ │ └── sharding/ │ │ │ ├── N1C2/ │ │ │ │ ├── gpt_stage2_bs16_fp16_DP1-MP1-PP1-Sharding2.sh │ │ │ │ ├── gpt_stage3_bs16_fp16_DP1-MP1-PP1-Sharding2.sh │ │ │ │ └── gpt_stage3_bs16_fp32_DP1-MP1-PP1-Sharding2.sh │ │ │ ├── N2C16/ │ │ │ │ └── gpt_stage2_bs128_fp16_DP1-MP1-PP1-Sharding16.sh │ │ │ └── 
benchmark_common/ │ │ │ ├── prepare.sh │ │ │ └── run_benchmark.sh │ │ └── static/ │ │ └── auto_parallel/ │ │ ├── N1C1/ │ │ │ └── gpt_auto_recompute_bs8_fp32_DP1-MP1-PP1.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ ├── imagen/ │ │ └── dygraph/ │ │ ├── N1C1/ │ │ │ ├── imagen_397M_text2im_64_bs1_fp32_DP1-MP1-PP1.sh │ │ │ └── imagen_SR256_bs1_fp32_DP1-MP1-PP1.sh │ │ ├── N1C8/ │ │ │ ├── imagen_2B_text2im_64_bs8_fp32_DP1-Sharding8.sh │ │ │ ├── imagen_397M_text2im_64_bs8_fp32_DP8-MP1-PP1.sh │ │ │ ├── imagen_SR256_bs8_fp32_DP8-MP1-PP1.sh │ │ │ └── imagen_text2im_64_debertav2_bs8_fp32_DP8-MP1-PP1.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ └── vit/ │ └── dygraph/ │ ├── finetune/ │ │ ├── N1C8/ │ │ │ ├── ViT_large_patch16_384_ft_fused_False_bs512_fp16_DP.sh │ │ │ └── ViT_large_patch16_384_ft_fused_True_bs512_fp16_DP.sh │ │ └── benchmark_common/ │ │ ├── prepare.sh │ │ └── run_benchmark.sh │ └── pretrained/ │ ├── N2C16/ │ │ ├── ViT_large_patch16_224_pt_fused_False_bs128_fp16_DP.sh │ │ └── ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.sh │ └── benchmark_common/ │ ├── prepare.sh │ └── run_benchmark.sh ├── codestyle/ │ ├── .gitignore │ ├── clang_format.hook │ ├── copyright.hook │ ├── cpplint_pre_commit.hook │ ├── docstring_checker.py │ ├── pylint_pre_commit.hook │ └── test_docstring_checker.py ├── docs/ │ ├── cluster_deployment.md │ ├── compression.md │ ├── deployment_faq.md │ ├── docker_install.md │ ├── quick_start.md │ └── standard.md ├── examples/ │ └── transformer/ │ ├── __init__.py │ ├── models/ │ │ └── GPT/ │ │ ├── docs/ │ │ │ ├── README.md │ │ │ ├── hybrid_parallel.md │ │ │ ├── hybrid_profiler.md │ │ │ ├── inference.md │ │ │ ├── quantization_aware_training.md │ │ │ ├── single_card.md │ │ │ ├── single_finetune.md │ │ │ └── structured_pruning.md │ │ ├── finetune/ │ │ │ ├── configs/ │ │ │ │ ├── finetune_gpt_345M_single_card_glue.yaml │ │ │ │ └── finetune_gpt_base.yaml │ │ │ ├── impls.py │ │ │ ├── run.py │ │ │ └── run_task.sh │ │ ├── generation/ │ │ │ ├── configs/ │ │ │ │ ├── generation_gpt_345M_dp8.yaml │ │ │ │ ├── generation_gpt_345M_single_card.yaml │ │ │ │ ├── generation_gpt_base.yaml │ │ │ │ ├── generation_pruned_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_6.7B_single_card.yaml │ │ │ │ ├── inference_gpt_345M_dp8.yaml │ │ │ │ └── inference_gpt_345M_single_card.yaml │ │ │ ├── export.py │ │ │ ├── impls.py │ │ │ ├── inference.py │ │ │ └── run.py │ │ ├── offline-eval/ │ │ │ ├── configs/ │ │ │ │ ├── eval_gpt_345M_single_card.yaml │ │ │ │ ├── eval_gpt_base.yaml │ │ │ │ ├── eval_pruned_gpt_345M_single_card.yaml │ │ │ │ └── eval_qat_gpt_345M_single_card.yaml │ │ │ ├── impls.py │ │ │ └── run.py │ │ ├── pretrain/ │ │ │ ├── configs/ │ │ │ │ ├── export_qat_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_1.3B_dp8.yaml │ │ │ │ ├── pretrain_gpt_1.3B_single_card.yaml │ │ │ │ ├── pretrain_gpt_175B_mp8_pp16.yaml │ │ │ │ ├── pretrain_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_6.7B_sharding16.yaml │ │ │ │ ├── pretrain_gpt_base.yaml │ │ │ │ ├── pretrain_gpt_cn_345M_single_card.yaml │ │ │ │ ├── prune_gpt_345M_single_card.yaml │ │ │ │ ├── qat_gpt_345M_mp8.yaml │ │ │ │ ├── qat_gpt_345M_single_card.yaml │ │ │ │ └── qat_gpt_6.7B_sharding16.yaml │ │ │ ├── export.py │ │ │ ├── impls.py │ │ │ └── run.py │ │ └── pretrain_moe/ │ │ ├── configs/ │ │ │ ├── pretrain_moe_345M_single_card.yaml │ │ │ └── pretrain_moe_base.yaml │ │ ├── impls.py │ │ └── run.py │ └── utils/ │ ├── __init__.py │ ├── 
components.py │ ├── config.py │ └── qat.py ├── ppfleetx/ │ ├── __init__.py │ ├── configs/ │ │ ├── multimodal/ │ │ │ └── imagen/ │ │ │ ├── imagen_397M_text2im_64x64.yaml │ │ │ ├── imagen_base.yaml │ │ │ ├── imagen_super_resolution_1024.yaml │ │ │ ├── imagen_super_resolution_256.yaml │ │ │ ├── imagen_text2im_64x64_DebertaV2.yaml │ │ │ └── imagen_text2im_64x64_T5-11B.yaml │ │ ├── nlp/ │ │ │ ├── ernie/ │ │ │ │ ├── auto/ │ │ │ │ │ ├── finetune_ernie_345M_single_card.yaml │ │ │ │ │ ├── finetune_ernie_base.yaml │ │ │ │ │ ├── pretrain_ernie_base.yaml │ │ │ │ │ └── pretrain_ernie_base_345M_single_card.yaml │ │ │ │ ├── finetune_ernie_345M_single_card.yaml │ │ │ │ ├── finetune_ernie_base.yaml │ │ │ │ ├── inference_ernie_345M_single_card.yaml │ │ │ │ ├── pretrain_ernie_base.yaml │ │ │ │ ├── pretrain_ernie_base_175B_mp8_pp16.yaml │ │ │ │ ├── pretrain_ernie_base_345M_single_card.yaml │ │ │ │ ├── pretrain_ernie_base_3D.yaml │ │ │ │ ├── pretrain_ernie_base_6.7B_sharding16.yaml │ │ │ │ ├── pretrain_ernie_large_single_card.yaml │ │ │ │ └── qat_ernie_base.yaml │ │ │ ├── gpt/ │ │ │ │ ├── auto/ │ │ │ │ │ ├── export_gpt_fp16_single_card.yaml │ │ │ │ │ ├── generation_gpt_175B_mp8.yaml │ │ │ │ │ ├── generation_gpt_345M_mp2.yaml │ │ │ │ │ ├── generation_gpt_345M_single_card.yaml │ │ │ │ │ ├── generation_gpt_6.7B_mp1.yaml │ │ │ │ │ ├── pretrain_gpt_1.3B_dp8.yaml │ │ │ │ │ ├── pretrain_gpt_1.3B_dp8_tuning.yaml │ │ │ │ │ ├── pretrain_gpt_1.3B_single_card.yaml │ │ │ │ │ ├── pretrain_gpt_345M_single_card.yaml │ │ │ │ │ ├── pretrain_gpt_6.7B_sharding16.yaml │ │ │ │ │ ├── pretrain_gpt_base.yaml │ │ │ │ │ └── qat_generation_gpt_345M_mp2.yaml │ │ │ │ ├── eval_gpt_345M_single_card.yaml │ │ │ │ ├── eval_pruned_gpt_345M_single_card.yaml │ │ │ │ ├── eval_qat_gpt_345M_single_card.yaml │ │ │ │ ├── export_qat_gpt_345M_single_card.yaml │ │ │ │ ├── finetune_gpt_345M_single_card_glue.yaml │ │ │ │ ├── finetune_gpt_base.yaml │ │ │ │ ├── generation_gpt_345M_dp8.yaml │ │ │ │ ├── generation_gpt_345M_mp1.yaml │ │ │ │ ├── generation_gpt_345M_single_card.yaml │ │ │ │ ├── generation_gpt_6.7B_single_mp1.yaml │ │ │ │ ├── generation_pruned_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_345M_single_card.yaml │ │ │ │ ├── generation_qat_gpt_6.7B_single_card.yaml │ │ │ │ ├── inference_gpt_345M_dp8.yaml │ │ │ │ ├── inference_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_1.3B_dp8.yaml │ │ │ │ ├── pretrain_gpt_1.3B_single_card.yaml │ │ │ │ ├── pretrain_gpt_13B_dp8.yaml │ │ │ │ ├── pretrain_gpt_175B_mp8_pp16.yaml │ │ │ │ ├── pretrain_gpt_345M_single_card.yaml │ │ │ │ ├── pretrain_gpt_6.7B_sharding16.yaml │ │ │ │ ├── pretrain_gpt_6.7B_single_card.yaml │ │ │ │ ├── pretrain_gpt_base.yaml │ │ │ │ ├── pretrain_gpt_cn_345M_single_card.yaml │ │ │ │ ├── prune_gpt_345M_single_card.yaml │ │ │ │ ├── qat_gpt_345M_mp8.yaml │ │ │ │ ├── qat_gpt_345M_single_card.yaml │ │ │ │ └── qat_gpt_6.7B_sharding16.yaml │ │ │ └── moe/ │ │ │ ├── pretrain_moe_1.3B_dp8.yaml │ │ │ └── pretrain_moe_base.yaml │ │ └── vis/ │ │ ├── base.yaml │ │ ├── moco/ │ │ │ ├── moco_lincls_in1k_1n8c.yaml │ │ │ ├── mocov1_pt_in1k_1n8c.yaml │ │ │ └── mocov2_pt_in1k_1n8c.yaml │ │ └── vit/ │ │ ├── ViT_base_patch16_224_inference.yaml │ │ ├── ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml │ │ ├── ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml │ │ ├── ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml 
│ │ ├── ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml │ │ └── auto/ │ │ ├── ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml │ │ └── base.yaml │ ├── core/ │ │ ├── __init__.py │ │ ├── engine/ │ │ │ ├── __init__.py │ │ │ ├── auto_engine.py │ │ │ ├── basic_engine.py │ │ │ ├── eager_engine.py │ │ │ └── inference_engine.py │ │ └── module/ │ │ ├── __init__.py │ │ └── basic_module.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_tools/ │ │ │ ├── __init__.py │ │ │ ├── cpp/ │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── compile.py │ │ │ │ └── fast_index_map_helpers.cpp │ │ │ ├── ernie/ │ │ │ │ ├── __init__.py │ │ │ │ └── preprocess/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── create_pretraining_data.py │ │ │ │ ├── docs/ │ │ │ │ │ ├── CLUECorpus2020.md │ │ │ │ │ ├── CLUECorpusSmall.md │ │ │ │ │ ├── OpenWebText2.md │ │ │ │ │ └── WuDaoCorpusBase.md │ │ │ │ ├── trans_to_json.py │ │ │ │ └── words_segmentation.py │ │ │ └── gpt/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── preprocess_data.py │ │ │ └── raw_trans_to_json.py │ │ ├── dataset/ │ │ │ ├── __init__.py │ │ │ ├── ernie/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset_utils.py │ │ │ │ └── ernie_dataset.py │ │ │ ├── glue_dataset.py │ │ │ ├── gpt_dataset.py │ │ │ ├── multimodal_dataset.py │ │ │ └── vision_dataset.py │ │ ├── sampler/ │ │ │ ├── __init__.py │ │ │ ├── batch_sampler.py │ │ │ └── collate.py │ │ ├── tokenizers/ │ │ │ ├── __init__.py │ │ │ ├── debertav2_tokenizer.py │ │ │ ├── ernie_tokenizer.py │ │ │ ├── gpt_tokenizer.py │ │ │ ├── t5_tokenization_utils.py │ │ │ ├── t5_tokenizer.py │ │ │ └── tokenization_utils_base.py │ │ ├── transforms/ │ │ │ ├── __init__.py │ │ │ ├── preprocess.py │ │ │ └── utils.py │ │ └── utils/ │ │ ├── __init__.py │ │ └── batch_collate_fn.py │ ├── distributed/ │ │ ├── __init__.py │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── amp.py │ │ │ ├── comm_groups.py │ │ │ ├── env.py │ │ │ ├── io.py │ │ │ └── strategy.py │ │ └── protein_folding/ │ │ ├── __init__.py │ │ ├── bp.py │ │ ├── dap.py │ │ ├── dp.py │ │ └── scg.py │ ├── models/ │ │ ├── __init__.py │ │ ├── language_model/ │ │ │ ├── __init__.py │ │ │ ├── auto_utils.py │ │ │ ├── debertav2/ │ │ │ │ ├── __init__.py │ │ │ │ └── modeling.py │ │ │ ├── ernie/ │ │ │ │ ├── __init__.py │ │ │ │ ├── auto/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── auto_model.py │ │ │ │ │ ├── auto_module.py │ │ │ │ │ └── auto_transformer.py │ │ │ │ ├── dygraph/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── hybrid_model.py │ │ │ │ │ └── single_model.py │ │ │ │ ├── ernie_module.py │ │ │ │ ├── finetune_configs.yaml │ │ │ │ └── layers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── distributed_transformer.py │ │ │ │ ├── model_outputs.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ ├── gpt/ │ │ │ │ ├── __init__.py │ │ │ │ ├── auto/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── auto_model.py │ │ │ │ │ └── auto_module.py │ │ │ │ └── dygraph/ │ │ │ │ ├── __init__.py │ │ │ │ ├── hybrid_model.py │ │ │ │ ├── processor.py │ │ │ │ ├── sequence_parallel_utils.py │ │ │ │ └── single_model.py │ │ │ ├── language_module.py │ │ │ ├── metrics.py │ │ │ ├── moe/ │ │ │ │ ├── __init__.py │ │ │ │ ├── comm/ │ │ │ │ │ └── __init__.py │ │ │ │ ├── comm_ops.py │ │ │ │ ├── gate/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_gate.py │ │ │ │ │ ├── gshard_gate.py │ │ │ │ │ ├── naive_gate.py │ │ │ │ │ └── switch_gate.py │ │ │ │ ├── moe_layer.py │ │ │ │ └── utils.py │ │ │ ├── moe_exp/ │ │ │ │ ├── __init__.py │ │ │ │ ├── experts.py │ │ │ │ ├── layer.py │ │ │ │ ├── mappings.py │ │ │ │ └── sharded_moe.py │ │ │ ├── t5/ │ │ │ │ 
├── __init__.py │ │ │ │ ├── modeling.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── multimodal_model/ │ │ │ ├── __init__.py │ │ │ ├── clip/ │ │ │ │ └── __init__.py │ │ │ ├── imagen/ │ │ │ │ ├── __init__.py │ │ │ │ ├── modeling.py │ │ │ │ ├── unet.py │ │ │ │ └── utils.py │ │ │ ├── multimodal_module.py │ │ │ └── utils.py │ │ ├── protein_folding/ │ │ │ ├── __init__.py │ │ │ ├── all_atom.py │ │ │ ├── attentions.py │ │ │ ├── common.py │ │ │ ├── evoformer.py │ │ │ ├── outer_product_mean.py │ │ │ ├── quat_affine.py │ │ │ ├── r3.py │ │ │ ├── residue_constants.py │ │ │ └── template.py │ │ └── vision_model/ │ │ ├── __init__.py │ │ ├── factory.py │ │ ├── general_classification_module.py │ │ ├── layers/ │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── droppath.py │ │ │ ├── embedding.py │ │ │ ├── identity.py │ │ │ ├── initializer.py │ │ │ └── mlp.py │ │ ├── loss/ │ │ │ ├── __init__.py │ │ │ └── cross_entropy.py │ │ ├── metrics/ │ │ │ ├── __init__.py │ │ │ └── accuracy.py │ │ ├── moco/ │ │ │ ├── __init__.py │ │ │ └── moco.py │ │ ├── moco_module.py │ │ ├── resnet/ │ │ │ └── __init__.py │ │ └── vit/ │ │ ├── __init__.py │ │ └── vit.py │ ├── ops/ │ │ ├── setup_cuda.py │ │ ├── test_topp_sampling.py │ │ └── topp_sampling.cu │ ├── optims/ │ │ ├── __init__.py │ │ ├── grad_clip.py │ │ ├── lr_scheduler.py │ │ └── optimizer.py │ ├── tools/ │ │ ├── __init__.py │ │ └── multiprocess_tool.py │ └── utils/ │ ├── __init__.py │ ├── check.py │ ├── compression_helper.py │ ├── config.py │ ├── device.py │ ├── download.py │ ├── export.py │ ├── file.py │ ├── log.py │ ├── tensor_fusion_helper.py │ └── version.py ├── projects/ │ ├── ernie/ │ │ ├── auto_export_ernie_345M_mp1.sh │ │ ├── auto_export_ernie_345M_mp2.sh │ │ ├── auto_export_ernie_345M_mp2_npu.sh │ │ ├── auto_export_ernie_345M_mp2_xpu.sh │ │ ├── docs/ │ │ │ ├── README.md │ │ │ └── inference.md │ │ ├── export_ernie_345M_single_card.sh │ │ ├── finetune_ernie_345M_single_card.sh │ │ ├── finetune_ernie_345M_single_card_npu.sh │ │ ├── inference.py │ │ ├── pretrain_ernie_base.sh │ │ ├── pretrain_ernie_base_175B_mp8_pp16.sh │ │ ├── pretrain_ernie_base_3D.sh │ │ ├── pretrain_ernie_base_3D_npu.sh │ │ ├── pretrain_ernie_base_6.7B_sharding16.sh │ │ ├── pretrain_ernie_large.sh │ │ ├── pretrain_ernie_large_mp2_mlu.sh │ │ ├── pretrain_ernie_large_mp2_npu.sh │ │ ├── pretrain_ernie_large_mp2_pp2_npu.sh │ │ ├── pretrain_ernie_large_npu.sh │ │ ├── run_inference.sh │ │ ├── run_inference_mp2.sh │ │ ├── run_inference_mp2_npu.sh │ │ └── run_inference_mp2_xpu.sh │ ├── gpt/ │ │ ├── auto_export_gpt_175B_mp8.sh │ │ ├── auto_export_gpt_345M_mp2.sh │ │ ├── auto_export_gpt_345M_single_card.sh │ │ ├── auto_export_gpt_6.7B_mp1.sh │ │ ├── auto_export_gpt_fp16_single_card.sh │ │ ├── auto_gpt_1.3B_dp8.sh │ │ ├── auto_gpt_1.3B_dp8_tuning.sh │ │ ├── auto_gpt_1.3B_single_card.sh │ │ ├── auto_gpt_345M_single_card.sh │ │ ├── auto_gpt_6.7B_sharding16.sh │ │ ├── auto_qat_export_gpt_345M_mp2.sh │ │ ├── benchmark.py │ │ ├── docs/ │ │ │ ├── README.md │ │ │ ├── auto_parallel.md │ │ │ ├── hybrid_parallel.md │ │ │ ├── hybrid_profiler.md │ │ │ ├── inference.md │ │ │ ├── quantization_aware_training.md │ │ │ ├── single_card.md │ │ │ ├── single_finetune.md │ │ │ └── structured_pruning.md │ │ ├── eval_prune_gpt_345M_single_card.sh │ │ ├── eval_qat_gpt_345M_single_card.sh │ │ ├── evaluate_gpt_345M_single_card.sh │ │ ├── export_gpt_345M_single_card.sh │ │ ├── export_prune_gpt_345M_single_card.sh │ │ ├── export_qat_gpt_345M_single_card.sh │ │ ├── finetune_gpt_345M_single_card.sh │ │ ├── inference.py │ │ ├── 
inference_gpt_6.7B_single_card.sh │ │ ├── inference_gpt_multigpu.sh │ │ ├── inference_gpt_single_card.sh │ │ ├── pretrain_gpt_1.3B_dp8.sh │ │ ├── pretrain_gpt_1.3B_single_card.sh │ │ ├── pretrain_gpt_175B_mp8_pp16.sh │ │ ├── pretrain_gpt_345M_single_card.sh │ │ ├── pretrain_gpt_6.7B_sharding16.sh │ │ ├── prune_gpt_345M_single_card.sh │ │ ├── qat_gpt_345M_mp8.sh │ │ ├── qat_gpt_345M_single_card.sh │ │ ├── qat_gpt_6.7B_sharding16.sh │ │ └── run_benchmark.sh │ ├── imagen/ │ │ ├── README.md │ │ ├── filelist/ │ │ │ └── laion_400M/ │ │ │ └── train │ │ ├── run_super_resolution_1024_sharding128.sh │ │ ├── run_super_resolution_256_dp128.sh │ │ ├── run_super_resolution_256_single_card.sh │ │ ├── run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh │ │ ├── run_text2im_397M_64x64_dp128.sh │ │ ├── run_text2im_397M_64x64_single_card.sh │ │ └── run_text2im_64x64_DebertaV2_dp8.sh │ ├── moco/ │ │ ├── README.md │ │ ├── run_mocov1_lincls_in1k.sh │ │ ├── run_mocov1_pretrain_in1k.sh │ │ ├── run_mocov2_lincls_in1k.sh │ │ └── run_mocov2_pretrain_in1k.sh │ ├── protein_folding/ │ │ └── README.md │ ├── ufo2.0/ │ │ └── README.md │ └── vit/ │ ├── README.md │ ├── auto_vit_patch16_224_dp8.sh │ ├── docs/ │ │ └── inference.md │ ├── export_qat.sh │ ├── inference.py │ ├── run_finetune.sh │ ├── run_finetune_fused_attention.sh │ ├── run_inference_base_patch16_224.sh │ ├── run_pretrain.sh │ ├── run_pretrained_fused_attention.sh │ └── run_qat.sh ├── requirements.txt ├── setup.py ├── tasks/ │ └── gpt/ │ ├── generation.py │ ├── inference.py │ └── run_generation.sh └── tools/ ├── auto.py ├── auto_export.py ├── eval.py ├── export.py ├── inference.py └── train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .DS_Store .idea ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/Lucas-C/pre-commit-hooks.git sha: v1.0.1 hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - repo: https://github.com/PaddlePaddle/mirrors-yapf.git sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ - repo: https://github.com/pre-commit/pre-commit-hooks sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0 hooks: - id: check-added-large-files - id: check-merge-conflict - id: check-symlinks - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer - repo: local hooks: - id: clang-format-with-version-check name: clang-format description: Format files with ClangFormat. entry: bash ./codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: local hooks: - id: cpplint-cpp-source name: cpplint description: Check C++ code style using cpplint.py. entry: bash ./codestyle/cpplint_pre_commit.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ - repo: local hooks: - id: pylint-doc-string name: pylint description: Check python docstring style using docstring_checker. entry: bash ./codestyle/pylint_pre_commit.hook language: system files: \.(py)$ - repo: local hooks: - id: copyright_checker name: copyright_checker entry: python ./codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ ================================================ FILE: Dockerfile ================================================ ARG BASE_IMAGE=registry.baidubce.com/paddlepaddle/paddle:2.4.1-gpu-cuda11.2-cudnn8.2-trt8.0 FROM $BASE_IMAGE WORKDIR /paddle RUN python -m pip install paddlepaddle-gpu==0.0.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html # RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetx/develop/requirements.txt && python -m pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple COPY requirements.txt /paddle RUN python -m pip install -r requirements.txt #-i https://mirror.baidu.com/pypi/simple ENV LD_LIBRARY_PATH=/usr/lib64/:${LD_LIBRARY_PATH} ================================================ FILE: LICENSE ================================================ Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

------------------------------------------------------------------------------------------

## Introduction

PaddleFleetX is a large-model toolkit built on the PaddlePaddle deep learning framework. It aims to provide high-performance, flexible, and easy-to-use capabilities for the entire large-model workflow, with end-to-end optimization across six stages: **development**, **training**, **fine-tuning**, **compression**, **inference**, and **deployment**.

*Figure: the PaddlePaddle large model suite*

## Key Features

### Large model development: unified dynamic/static development mode with flexible 4D hybrid parallelism

*Figure: large model development*

Building on PaddlePaddle's unified dynamic/static graph development mode, the suite is developed entirely with dynamic graphs; operator fusion is applied automatically in the Generate API, combining the debuggability of dynamic graphs with static-graph-level performance. The unified, all-scenario Trainer makes it easy to configure 4D hybrid parallelism (see the launch sketch below) and can be used in both pretraining and fine-tuning.

### Large model training: unlocking base compute potential and comprehensively improving distributed efficiency

For large model training, PaddlePaddle optimizes the full pipeline of data loading, mixed-precision compute strategies, high-performance operator libraries, automatic parallel-strategy tuning, and pipeline scheduling, helping speed up 文心 (ERNIE) large model training by 3x.

*Figure: PaddlePaddle support for large model training*
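The snippet below is a minimal, hedged sketch of such a hybrid-parallel launch. It mirrors the override flags used by the benchmark scripts under `benchmarks/test_tipc/` in this repository; the config path and degree values are illustrative and should be adapted to your own hardware and data.

```bash
# Illustrative hybrid-parallel launch (DP2 x MP2 x PP2 on 8 GPUs), following the
# pattern used by benchmarks/test_tipc/*/benchmark_common/run_benchmark.sh.
# dp_degree * mp_degree * pp_degree should match the number of devices.
python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 \
    tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \
    -o Distributed.dp_degree=2 \
    -o Distributed.mp_degree=2 \
    -o Distributed.pp_degree=2 \
    -o Global.local_batch_size=8 \
    -o Global.micro_batch_size=2
```

The same `-o` override mechanism is used throughout the repository's scripts to adjust parallel degrees, batch sizes, and engine settings without editing the YAML configs.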

### Large model fine-tuning: mainstream fine-tuning algorithms with leading performance

PaddleFleetX provides the mainstream fine-tuning algorithms SFT, Prefix-Tuning, and LoRA, effectively lowering the resource barrier for large model training. The unified Trainer reuses the pretraining acceleration techniques in fine-tuning scenarios, and variable-length data flow optimizations substantially improve fine-tuning performance.

*Figure: large model fine-tuning*

### Large model compression: in-house quantization algorithms for lossless quantization

PaddlePaddle's in-house Shift-SmoothQuant algorithm produces a smoother activation distribution than SmoothQuant, effectively improving the accuracy of quantized models and the stability of their generated outputs. With PaddleSlim's large-model compression tools, we achieve lossless quantization of mainstream open-source large models on the C-Eval and NL2SQL datasets. See [PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) for more technical details and usage instructions.

*Figure: model compression*

*Figure: model compression*
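Separately from the PaddleSlim post-training quantization flow described above, this repository also bundles quantization-aware training (QAT) configs, such as `ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml`. As a hedged sketch following the same `tools/train.py -c <yaml>` launch pattern used by the other scripts in this repo (flags and paths may need adjusting for your environment), QAT can be started with:

```bash
# Hypothetical single-card QAT launch using a config bundled with this repo;
# adjust devices, batch sizes, and data paths to your environment.
python -m paddle.distributed.launch --log_dir=./mylog --devices=0 \
    tools/train.py -c ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml
```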

### Large model inference: quantized inference schemes matched to workload characteristics

Because the prompt phase and the token generation phase of large-model inference have different computational characteristics, Paddle Inference provides static quantization for general scenarios, and mixed quantization with low-bit inference for memory-bandwidth-bound scenarios.

*Figure: PaddlePaddle support for large model inference*

*Figure: inference engine*

### Large model deployment: dynamic request insertion with real-time load awareness to maximize hardware utilization

In generation workloads the decoding phase is time-consuming and output lengths vary from query to query. To maximize serving throughput, the FastDeploy serving framework, combined with the inference engine, implements dynamic insertion: it monitors the serving load in real time and dynamically inserts incoming user requests to maximize inference hardware utilization.

*Figure: large model serving deployment*

## PaddleFleetX Application Cases

### Large language models

Building on PaddleFleetX's core capabilities, PaddleNLP provides rich end-to-end development and application examples for large language models; see [PaddleNLP LLM](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm) for detailed usage instructions.

### Cross-modal large models

Beyond large language models, PaddleFleetX also supports developing and training cross-modal large models, such as multimodal pretraining and text-to-image diffusion models, covering image, text, video, and audio modalities; see [PaddleMIX](https://github.com/PaddlePaddle/PaddleMIX) for detailed usage instructions.

### Biocomputing large models

In the biocomputing domain, building on PaddlePaddle's 4D parallel strategies and high-performance optimizations, PaddleHelix provides many industry-leading biocomputing pretrained models; see [PaddleHelix](https://github.com/PaddlePaddle/PaddleHelix) for detailed usage instructions.

## Citation

```
@misc{paddlefleetx,
  title={PaddleFleetX: An Easy-to-use and High-Performance One-stop Tool for Deep Learning},
  author={PaddleFleetX Contributors},
  howpublished = {\url{https://github.com/PaddlePaddle/PaddleFleetX}},
  year={2022}
}
```

## License

PaddleFleetX is released under the [Apache 2.0 license](./LICENSE).

================================================ FILE: benchmarks/README.md ================================================ ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C1/ernie_bs16_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 model=ernie micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C1/ernie_bs16_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
model_item=ernie dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 model=ernie micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C8/ernie_bs16_fp16_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP2-PP2 device_num=N1C8 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N1C8/ernie_bs16_fp32_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP2-PP2 device_num=N1C8 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP8-PP4 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp16_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP4-MP8-PP1 device_num=N4C32 model=ernie micro_bs=4 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp32 run_mode=DP1-MP8-PP4 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=ernie dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP8-PP2 device_num=N4C32 model=ernie micro_bs=2 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/N4C32/ernie_bs16_fp32_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=ernie dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP4-MP8-PP1 device_num=N4C32 model=ernie micro_bs=4 cd ./benchmarks bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf dataset/ernie mkdir -p dataset/ernie wget -O dataset/ernie/cluecorpussmall_14g_1207_ids_part0 https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_ids_part0 wget -O dataset/ernie/cluecorpussmall_14g_1207_ids_part1 https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_ids_part1 cat dataset/ernie/cluecorpussmall_14g_1207_ids_part* &> dataset/ernie/cluecorpussmall_14g_1207_ids.npy wget -O dataset/ernie/cluecorpussmall_14g_1207_idx.npz https://paddlefleetx.bj.bcebos.com/model/nlp/ernie/cluecorpussmall_14g_1207_idx.npz ================================================ FILE: benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="tokens/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" max_iter=${10:-500} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 use_sharding=${11:-"false"} # (可选) 是否使用Sharding num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${12:-"False"} # (可选)是否打开recompute sharding_stage=${13:-"1"} # (可选)sharding case sharding_offload=${14:-"False"} # (可选) eval_freq=${15:-"1000000"} # (可选) sharding_degree=${16:-"1"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` num_attention_heads=16 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #"gpt2-small-en" num_layers=24 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #"gpt2-small-en" use_pure_fp16=False # fp32 if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Global.seed=1234 \ -o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.hidden_size=1024 \ -o Model.num_hidden_layers=${num_layers} \ -o Model.num_attention_heads=${num_attention_heads} \ -o Model.use_recompute=${use_recompute} \ -o Data.Train.dataset.input_dir=./dataset/ernie \ -o 
Data.Eval.dataset.input_dir=./dataset/ernie \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ -o Optimizer.lr.max_lr=1e-4 \ -o Optimizer.lr.min_lr=1e-5 " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 # hybrid_parallelism case case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION} \ tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP1-PP1) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP2-PP2|DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 timeout 240m ${train_cmd} > ${log_file} 2>&1 else timeout 15m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_1024_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=gpt_1024 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_1024_flash_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_1024_flash dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/N1C8/gpt_2048_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_2048 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python -m pip install -r ../requirements.txt

# get data
cd ../
rm -rf data
mkdir data
wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/data_parallel/benchmark_common/run_benchmark.sh
================================================
#!/usr/bin/env bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}

function _set_params(){
    model_item=${1:-"model_item"}   # (required) model item
    fp_item=${2:-"fp32"}            # (required) fp32|fp16
    dp_degree=${3:-"1"}             # (required) data-parallel degree
    mp_degree=${4:-"1"}             # (required) model(tensor)-parallel degree
    pp_degree=${5:-"1"}             # (required) pipeline-parallel degree
    micro_batch_size=${6:-"2"}      # (required) micro_batch_size
    global_batch_size=${7:-"16"}    # (required) global_batch_size
    run_mode=${8:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1
    device_num=${9:-"N1C1"}         # (required) devices used: N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleFleetX"       # (required) name of the model suite
    speed_unit="tokens/s"           # (required) unit of the speed metric
    skip_steps=0                    # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                  # (required) keyword marking the log lines that carry performance data
    convergence_key="loss:"         # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
    yaml_path=${10:-"./pretrain/configs/pretrain_gpt_345M_single_card.yaml"}
    max_iter=${11:-500}             # (optional) keep the run under 5 minutes; if the code needs an early stop, submit a PR to the suite, or use a max_epoch parameter
    num_workers=0                   # (optional)
    base_batch_size=$global_batch_size
    eval_freq=${12:-"1000"}         # (optional) evaluation interval
    use_recompute=${13:-"False"}    # (optional) whether to enable recompute

    # The settings below are generic; normally no changes are needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) do not change this format; it keeps names aligned with competitor models
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
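    # Illustrative example (not in the original script): for the N1C8 gpt_1024 config
    # above (global_batch_size=64, fp16, run_mode=DP8-MP1-PP1), these settings resolve to
    #   model_name=gpt_1024_bs64_fp16_DP8-MP1-PP1
    #   train_log_file=${TRAIN_LOG_DIR}/PaddleFleetX_gpt_1024_bs64_fp16_DP8-MP1-PP1_N1C8_log
    # which is the log the framework later scans for the keyword "ips:".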
    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${local_batch_size}    # if the model runs multiple devices in one process, compute the per-run batch size here in _train
    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    if [ ${model_item} = "gpt_1024_flash" ];then
        args="-o Model.use_flash_attn=True"
    else
        args=""
    fi

    train_cmd="-c ${yaml_path} ${args} \
        -o Engine.max_steps=${max_iter} \
        -o Engine.eval_freq=${eval_freq} \
        -o Engine.save_load.save_steps=100000 \
        -o Distributed.dp_degree=${dp_degree} \
        "

    if [ ${PADDLE_TRAINER_ID} ]
    then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # The commands below are generic; normally no changes are needed.
    case ${run_mode} in
    DP8-MP1-PP1) echo "run run_mode: ${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            tools/train.py \
            ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "choose run_mode "; exit 1;
    esac
    cd ../
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    if [[ ${model_item} =~ "CE" ]];then # CE accuracy run: no time limit
        ${train_cmd} > ${log_file} 2>&1
    else
        timeout 15m ${train_cmd} > ${log_file} 2>&1
    fi
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
source ${BENCHMARK_ROOT}/scripts/run_model.sh   # run_model.sh parses benchmark-compliant logs with analysis.py; comment this out to only produce training logs without the full pipeline, but re-enable it before submitting
_set_params $@
#_train       # Uncomment to only produce training logs without parsing
_run          # Defined in run_model.sh and calls _train; comment this out to only produce training logs, but re-enable it before submitting

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_CoLA_bs32_fp16_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_item=CE_gpt_finetune_CoLA
dp_degree=1
mp_degree=1
pp_degree=1
bs_item=32
fp_item=fp16
run_mode=DP1-MP1-PP1
device_num=N1C1
convergence_key=mcc:
dataset=CoLA
model=gpt
micro_bs=${bs_item}

cd ./benchmarks
bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${convergence_key} ${dataset} 2>&1;

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_MRPC_acc_bs32_fp16_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors.
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_MRPC_acc dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=MRPC model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_MRPC_f1_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_MRPC_f1 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=f1: dataset=MRPC model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_QNLI_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=CE_gpt_finetune_QNLI dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=QNLI model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_RTE_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_RTE dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=RTE model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_SST2_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_SST2 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=SST2 model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_STSB_pearson_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_STSB_pearson dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=pearson: dataset=STSB model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_STSB_spearman_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=CE_gpt_finetune_STSB_spearman dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=spearman: dataset=STSB model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/N1C1/CE_gpt_finetune_WNLI_bs32_fp16_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=CE_gpt_finetune_WNLI dp_degree=1 mp_degree=1 pp_degree=1 bs_item=32 fp_item=fp16 run_mode=DP1-MP1-PP1 device_num=N1C1 convergence_key=acc: dataset=WNLI model=gpt micro_bs=${bs_item} cd ./benchmarks bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh # run sed -i "s/num_train_epochs=5/num_train_epochs=20/g" ../projects/gpt/finetune_gpt_345M_single_card.sh bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${convergence_key} ${dataset} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get ckpt cd ../ rm -rf ckpt mkdir -p ckpt wget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz tar -xzf ckpt/GPT_345M.tar.gz -C ckpt/ ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
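# Illustrative example (not part of the original script): the finetune launchers
# under N1C1/ append the convergence keyword and GLUE task as the 10th and 11th
# positional arguments, so a call to this script looks roughly like
#   bash ./test_tipc/gpt/dygraph/finetune/benchmark_common/run_benchmark.sh \
#        CE_gpt_finetune_CoLA fp16 1 1 1 32 32 DP1-MP1-PP1 N1C1 mcc: CoLA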
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}

function _set_params(){
    model_item=${1:-"model_item"}   # (required) model item
    fp_item=${2:-"fp32"}            # (required) fp32|fp16
    dp_degree=${3:-"1"}             # (required) data-parallel degree
    mp_degree=${4:-"1"}             # (required) model(tensor)-parallel degree
    pp_degree=${5:-"1"}             # (required) pipeline-parallel degree
    micro_batch_size=${6:-"2"}      # (required) micro_batch_size
    global_batch_size=${7:-"16"}    # (required) global_batch_size
    run_mode=${8:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1
    device_num=${9:-"N1C1"}         # (required) devices used: N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleFleetX"       # (required) name of the model suite
    speed_unit="steps/s"            # (required) unit of the speed metric
    skip_steps=0                    # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                  # (required) keyword marking the log lines that carry performance data
    convergence_key=${10:-"loss:"}  # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
    dataset=${11:-"CoLA"}           # dataset
    max_iter=${12:-500}             # (optional) keep the run under 5 minutes; if the code needs an early stop, submit a PR to the suite, or use a max_epoch parameter
    base_batch_size=$global_batch_size
    sharding_degree=${13:-"1"}      # (optional)
    sharding_stage=${14:-"1"}       # (optional) sharding case

    # The settings below are generic; normally no changes are needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) do not change this format; it keeps names aligned with competitor models
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed

    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${local_batch_size}    # if the model runs multiple devices in one process, compute the per-run batch size here in _train
    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    # if [ ${model_item} = "gpt3_moe" ];then
    #     static_scripts="../examples/language_model/gpt-moe/dygraph/"
    # else
    #     echo "not supported model item: ${model_item}"; exit 1;
    # fi

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    # data_path="./data/"
    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`

    train_cmd="${dataset}"

    # The commands below are generic; normally no changes are needed.
    # hybrid_parallelism case
    case ${run_mode} in
    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1"
        train_cmd="bash projects/gpt/finetune_gpt_345M_single_card.sh \
            ${train_cmd}"
        ;;
    *) echo "choose run_mode "; exit 1;
    esac
    cd ../
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    workerlog_id=0
    timeout 40m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
source ${BENCHMARK_ROOT}/scripts/run_model.sh   # run_model.sh parses benchmark-compliant logs with analysis.py; comment this out to only produce training logs without the full pipeline, but re-enable it before submitting
_set_params $@
#_train       # Uncomment to only produce training logs without parsing
_run          # Defined in run_model.sh and calls _train; comment this out to only produce training logs, but re-enable it before submitting

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C1/gpt_bs16_fp16_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_item=gpt
dp_degree=1
mp_degree=1
pp_degree=1
bs_item=16
fp_item=fp16
run_mode=DP1-MP1-PP1
device_num=N1C1
model=gpt
micro_bs=${bs_item}

cd ./benchmarks
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C1/gpt_bs16_fp32_DP1-MP1-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_item=gpt
dp_degree=1
mp_degree=1
pp_degree=1
bs_item=16
fp_item=fp32
run_mode=DP1-MP1-PP1
device_num=N1C1
model=gpt
micro_bs=${bs_item}

cd ./benchmarks
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1;

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C4/gpt_bs16_fp16_DP1-MP1-PP4.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=1 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP4 device_num=N1C4 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C4/gpt_bs16_fp16_DP1-MP4-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=4 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP4-PP1 device_num=N1C4 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP1-PP8.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=1 pp_degree=8 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP8 device_num=N1C8 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP2-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=2 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP2-PP4 device_num=N1C8 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP4-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=4 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP1-MP4-PP2 device_num=N1C8 model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP1-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
model_item=gpt dp_degree=1 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP8-PP1 device_num=N1C8 model=gpt micro_bs=16 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp16_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP2-PP2 device_num=N1C8 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs16_fp32_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP2-PP2 device_num=N1C8 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs64_fp16_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp16 run_mode=DP8-MP1-PP1 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_bs64_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=8 mp_degree=1 pp_degree=1 bs_item=64 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_recompute_bs16_fp16_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_recompute dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP2-PP2 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N1C8/gpt_recompute_bs16_fp32_DP2-MP2-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_recompute dp_degree=2 mp_degree=2 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP2-PP2 device_num=N1C8 max_iter=500 use_recompute=True model=gpt micro_bs=2 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp16 run_mode=DP1-MP8-PP4 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
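# Illustrative note (not part of the original script): for the N4C32 config below,
# dp_degree x mp_degree x pp_degree = 2 x 8 x 2 = 32, matching the 4-node x 8-GPU
# device_num=N4C32 layout; run_benchmark.sh then derives
#   local_batch_size = global_batch_size / dp_degree / sharding_degree = 16 / 2 / 1 = 8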
model_item=gpt dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp16_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP4-MP8-PP1 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP1-MP8-PP4.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=1 mp_degree=8 pp_degree=4 bs_item=16 fp_item=fp32 run_mode=DP1-MP8-PP4 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp32 run_mode=DP2-MP8-PP2 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/N4C32/gpt_bs16_fp32_DP4-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt dp_degree=4 mp_degree=8 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP4-MP8-PP1 device_num=N4C32 model=gpt micro_bs=4 cd ./benchmarks bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding}

function _set_params(){
    model_item=${1:-"model_item"}   # (required) model item
    fp_item=${2:-"fp32"}            # (required) fp32|fp16
    dp_degree=${3:-"1"}             # (required) data-parallel degree
    mp_degree=${4:-"1"}             # (required) model(tensor)-parallel degree
    pp_degree=${5:-"1"}             # (required) pipeline-parallel degree
    micro_batch_size=${6:-"2"}      # (required) micro_batch_size
    global_batch_size=${7:-"16"}    # (required) global_batch_size
    run_mode=${8:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1
    device_num=${9:-"N1C1"}         # (required) devices used: N1C1|N1C8|N4C32 (4 nodes, 32 GPUs)
    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
    model_repo="PaddleFleetX"       # (required) name of the model suite
    speed_unit="tokens/s"           # (required) unit of the speed metric
    skip_steps=0                    # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                  # (required) keyword marking the log lines that carry performance data
    convergence_key="loss:"         # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
    max_iter=${10:-500}             # (optional) keep the run under 5 minutes; if the code needs an early stop, submit a PR to the suite, or use a max_epoch parameter
    num_workers=0                   # (optional)
    base_batch_size=$global_batch_size
    use_recompute=${11:-"False"}    # (optional) whether to enable recompute
    eval_freq=${12:-"1000"}         # (optional) evaluation interval
    sharding_degree=${13:-"1"}      # (optional) group-sharded parallel degree
    sharding_stage=${14:-"1"}       # (optional) sharding strategy: 1 shards optimizer states only, 2 also shards gradients, 3 also shards forward parameters
    sharding_offload=${15:-"False"} # (optional) CPU offload strategy

    # The settings below are generic; normally no changes are needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) do not change this format; it keeps names aligned with competitor models
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed

    OUTPUT_PATH=${run_log_path}/output
}

function _train(){
    batch_size=${local_batch_size}    # if the model runs multiple devices in one process, compute the per-run batch size here in _train
    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}`

    num_attention_heads=16    #"gpt2-medium-en"
    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi    #"gpt2-small-en"
    num_layers=24             #"gpt2-medium-en"
    if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi    #"gpt2-small-en"

    use_pure_fp16=False
    if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi

    train_cmd="-o Global.seed=1234 \
        -o Global.local_batch_size=${local_batch_size} \
        -o Global.micro_batch_size=${micro_batch_size} \
        -o Engine.max_steps=${max_iter} \
        -o Engine.eval_freq=${eval_freq} \
        -o Engine.mix_precision.enable=${use_pure_fp16} \
        -o Engine.save_load.save_steps=100000 \
        -o Model.hidden_size=1024 \
        -o Model.num_layers=${num_layers} \
        -o Model.num_attention_heads=${num_attention_heads} \
        -o Model.type_vocab_size=1 \
        -o Model.use_recompute=${use_recompute} \
        -o Distributed.dp_degree=${dp_degree} \
        -o Distributed.mp_degree=${mp_degree} \
        -o Distributed.pp_degree=${pp_degree} \
        -o Distributed.sharding.sharding_degree=${sharding_degree} \
        -o Distributed.sharding.sharding_stage=${sharding_stage} \
        -o Distributed.sharding.sharding_offload=${sharding_offload} \
        -o Optimizer.lr.max_lr=1e-4 \
        -o Optimizer.lr.min_lr=1e-5 "

    if [ ${PADDLE_TRAINER_ID} ]
    then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # The commands below are generic; normally no changes are needed.
    case ${run_mode} in
    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\
            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \
            ${train_cmd}"
        workerlog_id=0
        ;;
    DP1-MP1-PP4|DP1-MP4-PP1) echo "run run_mode: ${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3 ${PADDLE_RANK_OPTION}\
            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \
            ${train_cmd}"
        workerlog_id=0
        ;;
    DP8-MP1-PP1|DP1-MP8-PP1|DP1-MP1-PP8|DP1-MP2-PP4|DP1-MP4-PP2|DP2-MP2-PP2| \
    DP2-MP8-PP2|DP4-MP8-PP1|DP1-MP8-PP4) echo "run run_mode: ${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \
            ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "choose run_mode "; exit 1;
    esac
    cd ../
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    if [[ ${model_item} =~ "CE" ]];then # CE accuracy run: no time limit
        ${train_cmd} > ${log_file} 2>&1
    else
        timeout 15m ${train_cmd} > ${log_file} 2>&1
    fi
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
source ${BENCHMARK_ROOT}/scripts/run_model.sh   # run_model.sh parses benchmark-compliant logs with analysis.py; comment this out to only produce training logs without the full pipeline, but re-enable it before submitting
_set_params $@
#_train       # Uncomment to only produce training logs without parsing
_run          # Defined in run_model.sh and calls _train; comment this out to only produce training logs, but re-enable it before submitting

================================================
FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N1C8/gpt_sp_False_bs8_fp16_DP1-MP8-PP1.sh
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
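# Illustrative note (not part of the original script): for the DP1-MP8-PP1 config
# below, run_benchmark.sh keeps the "gpt2-medium-en"-style shape
# (num_attention_heads=16, num_layers=24) because mp_degree=8 fails the
# "mp_degree < 8 and pp_degree < 8" check, and the sequence_parallel flag is
# forwarded unchanged as -o Model.sequence_parallel=${sequence_parallel}.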
model_item=gpt_sp_False dp_degree=1 mp_degree=8 pp_degree=1 bs_item=8 fp_item=fp16 run_mode=DP1-MP8-PP1 device_num=N1C8 sequence_parallel=False model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N1C8/gpt_sp_True_bs8_fp16_DP1-MP8-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_sp_True dp_degree=1 mp_degree=8 pp_degree=1 bs_item=8 fp_item=fp16 run_mode=DP1-MP8-PP1 device_num=N1C8 sequence_parallel=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N4C32/gpt_sp_False_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_sp_False dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 sequence_parallel=False model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/N4C32/gpt_sp_True_bs16_fp16_DP2-MP8-PP2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_sp_True dp_degree=2 mp_degree=8 pp_degree=2 bs_item=16 fp_item=fp16 run_mode=DP2-MP8-PP2 device_num=N4C32 sequence_parallel=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sequence_parallel} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sequence_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
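# Example invocation (a sketch using the values from the N1C8 gpt_sp_True launcher above):
#   bash run_benchmark.sh gpt_sp_True fp16 1 8 1 8 8 DP1-MP8-PP1 N1C8 True
# which _set_params below parses as model_item=gpt_sp_True, fp_item=fp16, dp_degree=1,
# mp_degree=8, pp_degree=1, micro_batch_size=8, global_batch_size=8, run_mode=DP1-MP8-PP1,
# device_num=N1C8 and sequence_parallel=True.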
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="tokens/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" sequence_parallel=${10:-"False"} # (可选)是否打开sequence_parallel max_iter=${11:-1000} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 eval_freq=${12:-"1000"} # (可选)模型评估间隔 num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${13:-"True"} # (可选)是否打开recompute sharding_degree=${14:-"1"} # (可选)分组切分并行维度 sharding_stage=${15:-"1"} # (可选)切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 sharding_offload=${16:-"False"} # (可选)CPU offload策略 # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` num_attention_heads=16 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #"gpt2-small-en" num_layers=24 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #"gpt2-small-en" use_pure_fp16=False if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Engine.max_steps=${max_iter} \ -o Engine.eval_iters=${eval_freq} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ -o Model.sequence_parallel=${sequence_parallel} \ -o Distributed.sharding.reduce_overlap=False \ -o 
Distributed.sharding.broadcast_overlap=False \ -o Optimizer.tensor_fusion=False " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id=0 ;; DP1-MP8-PP1) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP8-PP2) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 60m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage2_bs16_fp16_DP1-MP1-PP1-Sharding2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_stage2 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP1-Sharding2 device_num=N1C2 sharding_degree=2 sharding_stage=2 sharding_offload=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage3_bs16_fp16_DP1-MP1-PP1-Sharding2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_stage3 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp16 run_mode=DP1-MP1-PP1-Sharding2 device_num=N1C2 sharding_degree=2 sharding_stage=3 sharding_offload=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N1C2/gpt_stage3_bs16_fp32_DP1-MP1-PP1-Sharding2.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_stage3 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=16 fp_item=fp32 run_mode=DP1-MP1-PP1-Sharding2 device_num=N1C2 sharding_degree=2 sharding_stage=3 sharding_offload=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/N2C16/gpt_stage2_bs128_fp16_DP1-MP1-PP1-Sharding16.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
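#
# In the shared sharding run_benchmark.sh the per-rank batch size is derived as
#   local_batch_size = global_batch_size / dp_degree / sharding_degree
# so for the settings below (bs_item=128, dp_degree=1, sharding_degree=16) it works out to
# 128 / 1 / 16 = 8, matching micro_bs=8 (worked example for illustration only).
#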
model_item=gpt_stage2 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=128 fp_item=fp16 run_mode=DP1-MP1-PP1-Sharding16 device_num=N2C16 sharding_degree=16 sharding_stage=2 sharding_offload=True max_iter=30 model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${sharding_degree} ${sharding_stage} ${sharding_offload} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. 
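# Example invocation (a sketch using the values from the N1C2 gpt_stage2 launcher above):
#   bash run_benchmark.sh gpt_stage2 fp16 1 1 1 8 16 DP1-MP1-PP1-Sharding2 N1C2 2 2 True
# where the trailing three values are read by _set_params below as sharding_degree=2,
# sharding_stage=2 and sharding_offload=True (positional arguments 10-12).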
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="tokens/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" sharding_degree=${10:-"1"} # (可选)分组切分并行维度 sharding_stage=${11:-"1"} # (可选)切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 sharding_offload=${12:-"False"} # (可选)CPU offload策略 max_iter=${13:-500} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 eval_freq=${14:-"1000"} # (可选)模型评估间隔 num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${15:-"True"} # (可选)是否打开recompute # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` use_pure_fp16=False if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.use_recompute=${use_recompute} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1-Sharding2) echo "run run_mode: 
DP1-MP1-PP1-Sharding2" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1 ${PADDLE_RANK_OPTION}\ ./tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ -o Global.seed=1234 \ -o Model.hidden_size=1024 \ -o Model.num_layers=4 \ -o Model.num_attention_heads=4 \ -o Model.type_vocab_size=1 \ -o Optimizer.lr.max_lr=1e-4 \ -o Optimizer.lr.min_lr=1e-5 \ ${train_cmd}" workerlog_id=0 ;; DP1-MP1-PP1-Sharding16) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ ./tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \ -o Engine.logging_freq=1 \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 70m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/gpt/static/auto_parallel/N1C1/gpt_auto_recompute_bs8_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=gpt_auto_recompute dp_degree=1 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 max_iter=500 use_recompute=True model=gpt micro_bs=8 cd ./benchmarks bash ./test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh # run bash ./test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${max_iter} ${use_recompute} 2>&1; ================================================ FILE: benchmarks/test_tipc/gpt/static/auto_parallel/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ rm -rf data mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ================================================ FILE: benchmarks/test_tipc/gpt/static/auto_parallel/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. # Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="samples/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" max_iter=${10:-500} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size use_recompute=${11:-"False"} # (可选)是否打开recompute verbose=${12:-"3"} # (可选)是否打印性能数据 logging_freq=${13:-"100000"} # (可选)loss打印频率 sharding_degree=${14:-"1"} # (可选) sharding_stage=${15:-"1"} # (可选)sharding case # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, 
device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` num_attention_heads=16 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_attention_heads=4; fi #"gpt2-small-en" num_layers=24 #"gpt2-medium-en" if [ ${mp_degree} -lt 8 -a ${pp_degree} -lt 8 ]; then num_layers=4; fi #"gpt2-small-en" use_pure_fp16=False # fp32 if [ "fp16" = ${fp_item} ]; then use_pure_fp16=True; fi train_cmd="-o Global.seed=1234 \ -o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=100000 \ -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.hidden_size=1024 \ -o Model.num_layers=${num_layers} \ -o Model.num_attention_heads=${num_attention_heads} \ -o Model.type_vocab_size=1 \ -o Model.use_recompute=${use_recompute} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Optimizer.lr.max_lr=1e-4 \ -o Optimizer.lr.min_lr=1e-5 \ -o Engine.verbose=${verbose} \ -o Engine.logging_freq=${logging_freq} " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 ${PADDLE_RANK_OPTION}\ tools/auto.py -c ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id=0 ;; DP2-MP2-PP2) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\ tools/auto.py -c ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \ ${train_cmd}" workerlog_id_1=4 workerlog_id_2=6 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 20m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id_1} ${log_file} cp mylog/workerlog.${workerlog_id_2} ${log_file}_2 fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C1/imagen_397M_text2im_64_bs1_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_397M_text2im_64 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=1 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C1/imagen_SR256_bs1_fp32_DP1-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_SR256 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=1 fp_item=fp32 run_mode=DP1-MP1-PP1 device_num=N1C1 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_2B_text2im_64_bs8_fp32_DP1-Sharding8.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
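#
# The imagen launchers pass the model's config YAML (and, for this sharded 2B run, also
# max_iter, sharding_degree and sharding_stage) as extra positional arguments to the shared
# run_benchmark.sh. A sketch of the resulting call for the settings below:
#
#   bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh \
#       imagen_2B_text2im_64 fp32 1 1 1 1 8 DP1-Sharding8 N1C8 \
#       ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml 1000 8 2
#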
model_item=imagen_2B_text2im_64 dp_degree=1 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP1-Sharding8 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml max_iter=1000 sharding_degree=8 sharding_stage=2 model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} ${max_iter} ${sharding_degree} ${sharding_stage} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_397M_text2im_64_bs8_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_397M_text2im_64 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_SR256_bs8_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_SR256 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/N1C8/imagen_text2im_64_debertav2_bs8_fp32_DP8-MP1-PP1.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. model_item=imagen_text2im_64_debertav2 dp_degree=8 mp_degree=1 pp_degree=1 bs_item=8 fp_item=fp32 run_mode=DP8-MP1-PP1 device_num=N1C8 yaml_path=ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml model=imagen micro_bs=1 cd ./benchmarks bash ./test_tipc/imagen/dygraph/benchmark_common/prepare.sh # run bash ./test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \ ${yaml_path} 2>&1; ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/benchmark_common/prepare.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ wget -O projects/imagen/part-00079 https://paddlefleetx.bj.bcebos.com/data/laion400m/part-00079 # T5-11B mkdir -p projects/imagen/t5/t5-11b/ && cd projects/imagen/t5/t5-11b/ wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/config.json wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/spiece.model wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/tokenizer.json wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.1 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.2 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.3 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.4 cat t5.pd.tar.gz.* |tar -xf - cd - # DeBERTa V2 1.5B mkdir -p projects/imagen/cache/deberta-v-xxlarge && cd projects/imagen/cache/deberta-v-xxlarge wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/config.json wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/spm.model wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/tokenizer_config.json wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.1 cat debertav2.pd.tar.gz.* | tar -xf - cd - ================================================ FILE: benchmarks/test_tipc/imagen/dygraph/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Test training benchmark for a model. # Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 dp_degree=${3:-"1"} # (必选) dp数据并行度 mp_degree=${4:-"1"} # (必选) mp数据并行度 pp_degree=${5:-"1"} # (必选) pp数据并行度 micro_batch_size=${6:-"2"} # (必选) micro_batch_size global_batch_size=${7:-"16"} # (必选)global_batch_size run_mode=${8:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${9:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) yaml_path=${10:-"ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml"} profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="step/s" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="speed:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" max_iter=${11:-1000} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size sharding_degree=${12:-"1"} # (可选) sharding_stage=${13:-"1"} # (可选)sharding case sharding_offload=${14:-"False"} # (可选) # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi local_batch_size=`expr ${global_batch_size} / ${dp_degree} / ${sharding_degree}` train_cmd="-o Engine.max_steps=${max_iter} \ -o Global.local_batch_size=${local_batch_size} \ -o Global.micro_batch_size=${micro_batch_size} \ -o Distributed.dp_degree=${dp_degree} \ -o Distributed.mp_degree=${mp_degree} \ -o Distributed.pp_degree=${pp_degree} \ -o Distributed.sharding.sharding_degree=${sharding_degree} \ -o Distributed.sharding.sharding_stage=${sharding_stage} \ -o Distributed.sharding.sharding_offload=${sharding_offload} \ " if [ ${PADDLE_TRAINER_ID} ] then 
PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 case ${run_mode} in DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0 \ ${PADDLE_RANK_OPTION} tools/train.py -c ${yaml_path} \ ${train_cmd}" workerlog_id=0 ;; DP8-MP1-PP1|DP1-Sharding8) echo "run run_mode: ${run_mode}" train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 \ ${PADDLE_RANK_OPTION} tools/train.py -c ${yaml_path} \ ${train_cmd}" workerlog_id=0 ;; *) echo "choose run_mode "; exit 1; esac cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 30m ${train_cmd} > ${log_file} 2>&1 fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/N1C8/ViT_large_patch16_384_ft_fused_False_bs512_fp16_DP.sh ================================================ model_item=ViT_large_patch16_384_ft_fused_False fp_item=fp16 bs_item=512 run_mode=DP device_num=N1C8 use_fused_attn=False max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/N1C8/ViT_large_patch16_384_ft_fused_True_bs512_fp16_DP.sh ================================================ model_item=ViT_large_patch16_384_ft_fused_True fp_item=fp16 bs_item=512 run_mode=DP device_num=N1C8 use_fused_attn=True max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
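#
# prepare.sh below assumes the benchmark framework's data directory is reachable through
# BENCHMARK_ROOT (it copies the ImageNet archive from
# ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/). A minimal sketch of the expected
# environment, with an illustrative path:
#
#   export BENCHMARK_ROOT=/path/to/benchmark_framework
#   bash ./test_tipc/vit/dygraph/finetune/benchmark_common/prepare.sh
#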
python -m pip install -r ../requirements.txt # get data cd ../ mkdir dataset && cd dataset cp -r ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/ILSVRC2012.tgz ./ tar -zxf ILSVRC2012.tgz cd - # pretrained mkdir -p pretrained/vit/ wget -O ./pretrained/vit/imagenet21k-ViT-L_16.pdparams \ https://paddle-wheel.bj.bcebos.com/benchmark/imagenet21k-ViT-L_16.pdparams ================================================ FILE: benchmarks/test_tipc/vit/dygraph/finetune/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Test training benchmark for a model. # Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 global_batch_size=${3:-"128"} # (必选)global_batch_size run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="images/sec" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" use_fused_attn=${6:-"False"} max_iter=${7:-1} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi train_cmd="-o Engine.num_train_epochs=${max_iter} \ -o Model.model.use_fused_attn=${use_fused_attn} \ " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION} \ tools/train.py -c ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml \ ${train_cmd}" workerlog_id=0 cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 15m ${train_cmd} > ${log_file} 2>&1 fi if [ $? 
-ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/N2C16/ViT_large_patch16_224_pt_fused_False_bs128_fp16_DP.sh ================================================ model_item=ViT_large_patch16_224_pt_fused_False fp_item=fp16 bs_item=128 run_mode=DP device_num=N2C16 use_fused_attn=False max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/N2C16/ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.sh ================================================ model_item=ViT_large_patch16_224_pt_fused_True fp_item=fp16 bs_item=128 run_mode=DP device_num=N2C16 use_fused_attn=True max_iter=1 cd ./benchmarks bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh # run bash ./test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${bs_item} ${run_mode} ${device_num} \ ${use_fused_attn} ${max_iter} 2>&1; ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/benchmark_common/prepare.sh ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -m pip install -r ../requirements.txt # get data cd ../ mkdir dataset && cd dataset cp -r ${BENCHMARK_ROOT}/models_data_cfs/Paddle_distributed/ILSVRC2012.tgz ./ tar -zxf ILSVRC2012.tgz cd - ================================================ FILE: benchmarks/test_tipc/vit/dygraph/pretrained/benchmark_common/run_benchmark.sh ================================================ #!/usr/bin/env bash # Test training benchmark for a model. 
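# Example invocation (a sketch using the values from the N2C16 fused_True launcher above):
#   bash run_benchmark.sh ViT_large_patch16_224_pt_fused_True fp16 128 DP N2C16 True 1
# The benchmark name assembled by _set_params below for this case is
# ViT_large_patch16_224_pt_fused_True_bs128_fp16_DP.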
# Usage:bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_batch_size} ${global_batch_size} ${run_mode} ${device_num} ${use_sharding} function _set_params(){ model_item=${1:-"model_item"} # (必选) 模型 item fp_item=${2:-"fp32"} # (必选) fp32|fp16 global_batch_size=${3:-"128"} # (必选)global_batch_size run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP2-MP8-PP2|DP1-MP8-PP4|DP4-MP8-PP1 device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡) yaml_path=${6:-"./task/classification/vit/configs/ViT_base_patch16_224_in1k_1n8c_dp_fp16o2.yaml"} profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递 model_repo="PaddleFleetX" # (必选) 模型套件的名字 speed_unit="images/sec" # (必选)速度指标单位 skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字 convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" use_fused_attn=${7:-"False"} max_iter=${8:-1} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数 num_workers=0 # (可选) base_batch_size=$global_batch_size pretrained_model=${9:-"null"} # 以下为通用执行命令,无特殊可不用修改 model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐 device=${CUDA_VISIBLE_DEVICES//,/ } arr=(${device}) num_gpu_devices=${#arr[*]} run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量 profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量 speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)} # train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed OUTPUT_PATH=${run_log_path}/output } function _train(){ batch_size=${local_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs if [ -d $OUTPUT_PATH ]; then rm -rf $OUTPUT_PATH fi mkdir $OUTPUT_PATH echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}" if [ ${profiling} = "true" ];then add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\"" log_file=${profiling_log_file} else add_options="" log_file=${train_log_file} fi train_cmd="-o Engine.num_train_epochs=${max_iter} \ -o Data.Train.sampler.batch_size=${global_batch_size} \ -o Model.model.name=ViT_large_patch16_224 \ -o Model.model.use_fused_attn=${use_fused_attn} " if [ ${PADDLE_TRAINER_ID} ] then PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}" else PADDLE_RANK_OPTION="" fi # 以下为通用执行命令,无特殊可不用修改 train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --devices=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION} \ tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml \ ${train_cmd}" workerlog_id=0 cd ../ echo "train_cmd: ${train_cmd} log_file: ${log_file}" if [[ ${model_item} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else timeout 15m ${train_cmd} > ${log_file} 2>&1 fi if [ $? 
-ne 0 ];then echo -e "${model_name}, FAIL" else echo -e "${model_name}, SUCCESS" fi #kill -9 `ps -ef|grep 'python'|awk '{print $2}'` if [ ${device_num} != "N1C1" -a -d mylog ]; then rm ${log_file} cp mylog/workerlog.${workerlog_id} ${log_file} fi } export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ #_train # 如果只产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开 ================================================ FILE: codestyle/.gitignore ================================================ *.pyc ================================================ FILE: codestyle/clang_format.hook ================================================ #!/bin/bash set -e readonly VERSION="13.0.0" version=$(clang-format -version) if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then echo "clang-format installation by pip need python version great equal 3.6, please change the default python to higher version." exit 1 fi if ! [[ $version == *"$VERSION"* ]]; then # low version of pip may not have the source of clang-format whl pip install --upgrade pip pip install clang-format==13.0.0 fi clang-format $@ ================================================ FILE: codestyle/copyright.hook ================================================ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals import argparse import io import re import sys import os import datetime COPYRIGHT = '''Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License.''' def _generate_copyright(comment_mark): copyright=COPYRIGHT.split(os.linesep) header = copyright[0].rstrip() p = re.search('(\d{4})', header).group(0) now = datetime.datetime.now() header = header.replace(p,str(now.year)) ans=[comment_mark + " " + header + os.linesep] for idx, line in enumerate(copyright[1:]): ans.append(comment_mark + " " + line.rstrip() + os.linesep) return ans def _get_comment_mark(path): lang_type=re.compile(r"\.(py|sh)$") if lang_type.search(path) is not None: return "#" lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$") if lang_type.search(path) is not None: return "//" return None RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE) RE_COPYRIGHT = re.compile(r".*Copyright \(c\) \d{4}", re.IGNORECASE) RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!") def _check_copyright(path): head=[] try: with open(path) as f: head = [next(f) for x in range(4)] except StopIteration: pass for idx, line in enumerate(head): if RE_COPYRIGHT.search(line) is not None: return True return False def generate_copyright(path, comment_mark): original_contents = io.open(path, encoding="utf-8").readlines() head = original_contents[0:4] insert_line_no=0 for i, line in enumerate(head): if RE_ENCODE.search(line) or RE_SHEBANG.search(line): insert_line_no=i+1 copyright = _generate_copyright(comment_mark) if insert_line_no == 0: new_contents = copyright if len(original_contents) > 0 and len(original_contents[0].strip()) != 0: new_contents.append(os.linesep) new_contents.extend(original_contents) else: new_contents=original_contents[0:insert_line_no] new_contents.append(os.linesep) new_contents.extend(copyright) if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0: new_contents.append(os.linesep) new_contents.extend(original_contents[insert_line_no:]) new_contents="".join(new_contents) with io.open(path, 'w') as output_file: output_file.write(new_contents) def main(argv=None): parser = argparse.ArgumentParser( description='Checker for copyright declaration.') parser.add_argument('filenames', nargs='*', help='Filenames to check') args = parser.parse_args(argv) retv = 0 for path in args.filenames: comment_mark = _get_comment_mark(path) if comment_mark is None: print("warning:Unsupported file", path, file=sys.stderr) continue if _check_copyright(path): continue generate_copyright(path, comment_mark) if __name__ == '__main__': exit(main()) ================================================ FILE: codestyle/cpplint_pre_commit.hook ================================================ #!/bin/bash TOTAL_ERRORS=0 readonly VERSION="1.6.0" version=$(cpplint --version) if [[ ! $TRAVIS_BRANCH ]]; then # install cpplint on local machine. if ! [[ $version == *"$VERSION"* ]]; then pip install cpplint==1.6.0 fi # diff files on local machine. files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}') else # diff files between PR and latest commit on Travis CI. 
branch_ref=$(git rev-parse "$TRAVIS_BRANCH") head_ref=$(git rev-parse HEAD) files=$(git diff --name-status $branch_ref $head_ref | awk '$1 != "D" {print $2}') fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $files; do if [[ $file =~ ^(patches/.*) ]]; then continue; else cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); fi done exit $TOTAL_ERRORS ================================================ FILE: codestyle/docstring_checker.py ================================================ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """DocstringChecker is used to check python doc string's style.""" import astroid from pylint.checkers import BaseChecker, utils from pylint.interfaces import IAstroidChecker from collections import defaultdict import re def register(linter): """Register checkers.""" linter.register_checker(DocstringChecker(linter)) class Docstring(object): """Docstring class holds the parsed doc string elements. """ def __init__(self): self.d = defaultdict(list) #name->[] self.clear() def clear(self): self.d['Args'] = [] self.d['Examples'] = [] self.d['Returns'] = [] self.d['Raises'] = [] self.args = {} #arg_name->arg_type def get_level(self, string, indent=' '): level = 0 unit_size = len(indent) while string[:unit_size] == indent: string = string[unit_size:] level += 1 return level def parse(self, doc): """parse gets sections from doc Such as Args, Returns, Raises, Examples s Args: doc (string): is the astroid node doc string. Returns: True if doc is parsed successfully. """ self.clear() lines = doc.splitlines() state = ("others", -1) for l in lines: c = l.strip() if len(c) <= 0: continue level = self.get_level(l) if c.startswith("Args:"): state = ("Args", level) elif c.startswith("Returns:"): state = ("Returns", level) elif c.startswith("Raises:"): state = ("Raises", level) elif c.startswith("Examples:"): state = ("Examples", level) else: if level > state[1]: self.d[state[0]].append(c) continue state = ("others", -1) self.d[state[0]].append(c) self._arg_with_type() return True def get_returns(self): return self.d['Returns'] def get_raises(self): return self.d['Raises'] def get_examples(self): return self.d['Examples'] def _arg_with_type(self): for t in self.d['Args']: m = re.search(r'([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t) if m: self.args[m.group(1)] = m.group(2) return self.args class DocstringChecker(BaseChecker): """DosstringChecker is pylint checker to check docstring style. """ __implements__ = (IAstroidChecker, ) POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument' KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument' name = 'doc-string-checker' symbol = "doc-string" priority = -1 msgs = { 'W9001': ('One line doc string on > 1 lines', symbol + "-one-line", 'Used when a short doc string is on multiple lines'), 'W9002': ('Doc string does not end with "." 
period', symbol + "-end-with", 'Used when a doc string does not end with a period'), 'W9003': ('All args with their types must be mentioned in doc string %s', symbol + "-with-all-args", 'Used when not all arguments are in the doc string '), 'W9005': ('Missing docstring or docstring is too short', symbol + "-missing", 'Add docstring longer >=10'), 'W9006': ('Docstring indent error, use 4 space for indent', symbol + "-indent-error", 'Use 4 space for indent'), 'W9007': ('You should add `Returns` in comments', symbol + "-with-returns", 'There should be a `Returns` section in comments'), 'W9008': ('You should add `Raises` section in comments', symbol + "-with-raises", 'There should be a `Raises` section in comments'), } options = () def visit_functiondef(self, node): """visit_functiondef checks Function node docstring style. Args: node (astroid.node): The visiting node. Returns: True if successful other wise False. """ self.check_doc_string(node) if node.tolineno - node.fromlineno <= 10: return True if not node.doc: return True doc = Docstring() doc.parse(node.doc) self.all_args_in_doc(node, doc) self.with_returns(node, doc) self.with_raises(node, doc) def visit_module(self, node): self.check_doc_string(node) def visit_classdef(self, node): self.check_doc_string(node) def check_doc_string(self, node): self.missing_doc_string(node) self.one_line(node) self.has_period(node) self.indent_style(node) def missing_doc_string(self, node): if node.name.startswith("__") or node.name.startswith("_"): return True if node.tolineno - node.fromlineno <= 10: return True if node.doc is None or len(node.doc) < 10: self.add_message('W9005', node=node, line=node.fromlineno) return False # FIXME(gongwb): give the docstring line-no def indent_style(self, node, indent=4): """indent_style checks docstring's indent style Args: node (astroid.node): The visiting node. indent (int): The default indent of style Returns: True if successful other wise False. """ if node.doc is None: return True doc = node.doc lines = doc.splitlines() line_num = 0 for l in lines: if line_num == 0: continue cur_indent = len(l) - len(l.lstrip()) if cur_indent % indent != 0: self.add_message('W9006', node=node, line=node.fromlineno) return False line_num += 1 return True def one_line(self, node): """one_line checks if docstring (len < 40) is on one line. Args: node (astroid.node): The node visiting. Returns: True if successful otherwise False. """ doc = node.doc if doc is None: return True if len(doc) > 40: return True elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3: return True else: self.add_message('W9001', node=node, line=node.fromlineno) return False return True def has_period(self, node): """has_period checks if one line doc end-with '.' . Args: node (astroid.node): the node is visiting. Returns: True if successful otherwise False. """ if node.doc is None: return True if len(node.doc.splitlines()) > 1: return True if not node.doc.strip().endswith('.'): self.add_message('W9002', node=node, line=node.fromlineno) return False return True def with_raises(self, node, doc): """with_raises checks if one line doc end-with '.' . Args: node (astroid.node): the node is visiting. doc (Docstring): Docstring object. Returns: True if successful otherwise False. 
""" find = False for t in node.body: if not isinstance(t, astroid.Raise): continue find = True break if not find: return True if len(doc.get_raises()) == 0: self.add_message('W9008', node=node, line=node.fromlineno) return False return True def with_returns(self, node, doc): """with_returns checks if docstring comments what are returned . Args: node (astroid.node): the node is visiting. doc (Docstring): Docstring object. Returns: True if successful otherwise False. """ if node.name.startswith("__") or node.name.startswith("_"): return True find = False for t in node.body: if not isinstance(t, astroid.Return): continue find = True break if not find: return True if len(doc.get_returns()) == 0: self.add_message('W9007', node=node, line=node.fromlineno) return False return True def all_args_in_doc(self, node, doc): """all_args_in_doc checks if arguments are mentioned in doc Args: node (astroid.node): the node is visiting. doc (Docstring): Docstring object Returns: True if successful otherwise False. """ if node.name.startswith("__") or node.name.startswith("_"): return True args = [] for arg in node.args.get_children(): if (not isinstance(arg, astroid.AssignName)) \ or arg.name == "self": continue args.append(arg.name) if len(args) <= 0: return True parsed_args = doc.args args_not_documented = set(args) - set(parsed_args) if len(args) > 0 and len(parsed_args) <= 0: self.add_message( 'W9003', node=node, line=node.fromlineno, args=list(args_not_documented)) return False for t in args: if t not in parsed_args: self.add_message( 'W9003', node=node, line=node.fromlineno, args=[t, ]) return False return True ================================================ FILE: codestyle/pylint_pre_commit.hook ================================================ #!/bin/bash TOTAL_ERRORS=0 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" export PYTHONPATH=$DIR:$PYTHONPATH readonly VERSION="2.12.0" version=$(pylint --version | grep 'pylint') if ! [[ $version == *"$VERSION"* ]]; then pip install pylint==2.12.0 fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do pylint --disable=all --load-plugins=docstring_checker \ --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done exit $TOTAL_ERRORS #For now, just warning: #exit 0 Footer ================================================ FILE: codestyle/test_docstring_checker.py ================================================ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import docstring_checker import pylint.testutils import astroid import pytest import sys class TestDocstring(pylint.testutils.CheckerTestCase): CHECKER_CLASS = docstring_checker.DocstringChecker def test_one_line(self): func_node = astroid.extract_node(''' def test(): """get news. 
""" if True: return 5 return 5 ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9001' == got[0][0] def test_one_line_1(self): func_node = astroid.extract_node(''' def test(): """get news""" if True: return 5 return 5 ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9002' == got[0][0] def test_args(self): func_node = astroid.extract_node(''' def test(scale, mean): """get news. Args: scale (int): scale is the number. """ mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9003' == got[0][0] def test_missing(self): func_node = astroid.extract_node(''' def test(): mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9005' == got[0][0] def test_indent(self): func_node = astroid.extract_node(''' def test(): """ get get get get get get get get get get get get get get get get. """ pass ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9006' == got[0][0] def test_with_resturns(self): func_node = astroid.extract_node(''' def test(): """get news. Args: scale (int): scale is the number. """ mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale return mean ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9007' == got[0][0] def test_with_raises(self): func_node = astroid.extract_node(''' def test(): """get news. Args: scale (int): scale is the number. """ mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale mean=scale raise ValueError('A very specific bad thing happened.') ''') self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 1 assert 'W9008' == got[0][0] def test_no_message(self): p = ''' def fc(input, size, num_flatten_dims=1, param_attr=None, bias_attr=None, act=None, name=None): """ **Fully Connected Layer** The fully connected layer can take multiple tensors as its inputs. It creates a variable called weights for each input tensor, which represents a fully connected weight matrix from each input unit to each output unit. The fully connected layer multiplies each input tensor with its coresponding weight to produce an output Tensor. If multiple input tensors are given, the results of multiple multiplications will be sumed up. If bias_attr is not None, a bias variable will be created and added to the output. Finally, if activation is not None, it will be applied to the output as well. This process can be formulated as follows: Args: input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of the input tensor(s) is at least 2. size(int): The number of output units in this layer. num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than two dimensions. If this happens, the multidimensional tensor will first be flattened into a 2-dimensional matrix. 
The parameter `num_flatten_dims` determines how the input tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) dimensions will be flatten to form the first dimension of the final matrix (height of the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias of this layer. If it is set to None, no bias will be added to the output units. act (str, default None): Activation to be applied to the output of this layer. name (str, default None): The name of this layer. Returns: A tensor variable storing the transformation result. Raises: ValueError: If rank of the input tensor is less than 2. Examples: .. code-block:: python data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, act="tanh") """ raise ValueError('A very specific bad thing happened.') size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 size = 1 return size ''' func_node = astroid.extract_node(p) self.checker.visit_functiondef(func_node) got = self.linter.release_messages() assert len(got) == 0 ================================================ FILE: docs/cluster_deployment.md ================================================ ## 集群部署 本文档介绍在集群上使用分布式进行大模型训练的方法,包括在 Kubernetes 上使用 PaddlePaddle 分布式和在云上使用的方法。 ### 1. 
Kubernetes部署 在 Kubernetes 上部署分布式任务需要安装 [paddle-operator](https://github.com/PaddleFlow/paddle-operator) 。 paddle-operator 通过添加自定义资源类型 (paddlejob) 以及部署 controller 和一系列 Kubernetes 原生组件的方式实现简单定义即可运行 PaddlePaddle 任务的需求。 目前支持运行 ParameterServer (PS) 和 Collective 两种分布式任务,当然也支持运行单节点任务。 **paddle-operator 安装** 安装 paddle-operator 需要有已经安装的 Kubernetes (v1.16+) 集群和 [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) (v1.16+) 工具。 本节所需配置文件和示例可以在 [这里](https://github.com/PaddleFlow/paddle-operator/tree/main/deploy) 找到, 可以通过 *git clone* 或者复制文件内容保存。 ```yaml deploy |-- examples | |-- resnet.yaml | |-- wide_and_deep.yaml | |-- wide_and_deep_podip.yaml | |-- wide_and_deep_service.yaml | `-- wide_and_deep_volcano.yaml |-- v1 | |-- crd.yaml | `-- operator.yaml ``` 执行以下命令, ```shell kubectl create -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/dev/deploy/v1/crd.yaml ``` 或者 ```shell kubectl create -f deploy/v1/crd.yaml ``` 通过以下命令查看是否成功, ```shell kubectl get crd NAME CREATED AT paddlejobs.batch.paddlepaddle.org 2021-02-08T07:43:24Z ``` 执行以下部署命令, ```shell kubectl create -f https://raw.githubusercontent.com/PaddleFlow/paddle-operator/dev/deploy/v1/operator.yaml ``` 或者 ```shell kubectl create -f deploy/v1/operator.yaml ``` 通过以下命令查看部署结果和运行状态, ```shell kubectl -n paddle-system get pods NAME READY STATUS RESTARTS AGE paddle-controller-manager-698dd7b855-n65jr 1/1 Running 0 1m ``` 通过查看 controller 日志以确保运行正常, ```shell kubectl -n paddle-system logs paddle-controller-manager-698dd7b855-n65jr ``` 提交 demo 任务查看效果, ```shell kubectl -n paddle-system create -f deploy/examples/wide_and_deep.yaml ``` 查看 paddlejob 任务状态, pdj 为 paddlejob 的缩写, ```shell kubectl -n paddle-system get pdj NAME STATUS MODE AGE wide-ande-deep-service Completed PS 4m4s ``` 以上信息可以看出:训练任务已经正确完成,该任务为 ps 模式。 可通过 cleanPodPolicy 配置任务完成/失败后的 pod 删除策略,详见任务配置。 训练期间可以通过如下命令查看 pod 状态, ```shell kubectl -n paddle-system get pods ``` **paddlejob 任务提交** 本resnet示例为 Collective 模式,使用 GPU 进行训练,只需要配置 worker,worker 配置中需要声明使用的 GPU 信息。 准备配置文件, ```yaml apiVersion: batch.paddlepaddle.org/v1 kind: PaddleJob metadata: name: resnet spec: cleanPodPolicy: Never worker: replicas: 2 template: spec: containers: - name: paddle image: registry.baidubce.com/paddle-operator/demo-resnet:v1 command: - python args: - "-m" - "paddle.distributed.launch" - "train_fleet.py" volumeMounts: - mountPath: /dev/shm name: dshm resources: limits: nvidia.com/gpu: 1 volumes: - name: dshm emptyDir: medium: Memory ``` 注意: * 这里需要添加 shared memory 挂载以防止缓存出错。 * 本示例采用内置 flower 数据集,程序启动后会进行下载,根据网络环境可能等待较长时间。 提交任务: 使用 kubectl 提交 yaml 配置文件以创建任务, ```shell kubectl -n paddle-system create -f resnet.yaml ``` **卸载** 通过以下命令卸载部署的组件, ```shell kubectl delete -f deploy/v1/crd.yaml -f deploy/v1/operator.yaml ``` *注意:重新安装时,建议先卸载再安装* ### 2. 
公有云和私有云部署 在公有云上运行 PaddlePaddle 分布式建议通过选购容器引擎服务的方式,各大云厂商都推出了基于标准 Kubernetes 的云产品,然后根据上节中的教程安装使用即可。 | 云厂商 | 容器引擎 | 链接 | | --- | ---- | -------------------------------------------- | | 百度云 | CCE | https://cloud.baidu.com/product/cce.html | | 阿里云 | ACK | https://help.aliyun.com/product/85222.html | | 华为云 | CCE | https://www.huaweicloud.com/product/cce.html | 更为方便的是使用百度提供的全功能AI开发平台 [BML](https://cloud.baidu.com/product/bml) 来使用,详细的使用方式请参考 [BML文档](https://ai.baidu.com/ai-doc/BML/pkhxhgo5v)。 ================================================ FILE: docs/compression.md ================================================ # 模型压缩 ------------------------------------------------------------------------------------------ ## **简介** PaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法:量化训练(Qutization Aware Training,QAT)、结构化稀疏(Structured Pruning,SP)和知识蒸馏(Knowledge Distillation,KD)。本文会介绍如何在 PaddleFleetX 中使用这些功能,来压缩并且导出压缩后的模型。 ## **特性** - 量化训练:通过将全连接层的矩阵乘计算由 Float 浮点型优化为 INT8 整型来优化推理性能; - 结构化稀疏:通过剪裁全连接层权重的通道数目来优化推理性能; - 知识蒸馏:通过使用高精度的大模型(教师模型)来蒸馏低精度的小模型(学生模型)来提升小模型精度 ## **配置文档** 模型压缩开关通过 Compress 字段控制,预训练的模型参数路径由 pretrained 指定。接下来就是量化训练、结构化稀疏和知识蒸馏各自的技术参数。 ```yaml Compress: pretrained: // 预训练模型参数的保存路径 Quantization: // 量化训练参数 Prune: // 结构化稀疏参数 Distillation: // 知识蒸馏参数 ``` **注意**: 我们正在开发上述三种压缩方法的联合使用,请先单独使用上述各个方法。 ### **量化训练参数** ```yaml Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_preprocess_type: None activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ``` 其中参数说明: | **参数名** | **参数释义** | |-----------------------------|-----------------------------------------| | pretrained | 预训练模型的加载目录,若设置该参数,将在量化之前加载预训练模型;若需要加载量化后参数,将此参数设置为None,直接设置Engine.save_load.ckpt_dir即可 | | enable | 是否开启量化训练 | | weight_quantize_type | weight量化方法, 默认为`channel_wise_abs_max`, 此外还支持`abs_max` | | activation_quantize_type | activation量化方法, 默认为`moving_average_abs_max` | | weight_preprocess_type | weight预处理方法,默认为None,代表不进行预处理;当需要使用`PACT`方法时设置为`PACT` | | activation_preprocess_type | activation预处理方法,默认为None,代表不进行预处理 | | weight_bits | weight量化比特数, 默认为 8 | | activation_bits | activation量化比特数, 默认为 8 | | quantizable_layer_type | 需要量化的算子类型 | | onnx_format | 是否使用新量化格式,默认为False | 更详细的量化训练参数介绍可参考[PaddleSlim动态图量化训练接口介绍](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/dygraph/quanter/qat.rst)。 ### **结构化稀疏参数** ```yaml Compress: pretrained: Prune: enable: True criterion: l1_norm ratio: 0.125 ``` 其中参数说明: | **参数名** | **参数释义** | |-----------------------------|-----------------------------------------| | pretrained | 预训练模型的加载目录 | | enable | 是否开启结构化稀疏训练 | | criterion | 权重的重要性指标,目前支持l1_norm 和 l2_norm| | ratio | 权重稀疏的比例。例如,0.125的意思是12.5%的权重会被稀疏掉 | ================================================ FILE: docs/deployment_faq.md ================================================ ## 环境验证和常见问题 本文为环境问题排查指引,包括环境正确性验证的方法和常见的一些问题解决方法。 ### 1. 单机环境验证 以下验证不区分本机环境和 Docker 环境。 **GPU验证** 当使用 GPU 时,使用 `nvidia-smi` 命令查看环境中 GPU 状态,预期输出如下 ```shell Thu Jul 21 19:32:03 2022 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. 
| |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:3F:00.0 Off | 0 | | N/A 33C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 Tesla V100-SXM2... On | 00000000:40:00.0 Off | 0 | | N/A 34C P0 41W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 2 Tesla V100-SXM2... On | 00000000:41:00.0 Off | 0 | | N/A 35C P0 41W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 3 Tesla V100-SXM2... On | 00000000:42:00.0 Off | 0 | | N/A 38C P0 42W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 4 Tesla V100-SXM2... On | 00000000:62:00.0 Off | 0 | | N/A 34C P0 39W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 5 Tesla V100-SXM2... On | 00000000:63:00.0 Off | 0 | | N/A 36C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 6 Tesla V100-SXM2... On | 00000000:64:00.0 Off | 0 | | N/A 37C P0 41W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 7 Tesla V100-SXM2... On | 00000000:65:00.0 Off | 0 | | N/A 36C P0 39W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+ ``` 结果中可以看出 * CUDA Version栏显示的是当前环境中的CUDA版本号,此处为11.2。开始使用飞桨前,请先保证此处CUDA Version显示正常。如果CUDA Version栏不显示版本号,则需要添加CUDA相关库的路径到环境变量`LD_LIBRARY_PATH`中,例如执行命令添加 `export LD_LIBRARY_PATH=/usr/lib64/:/usr/local/lib/:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}` 。具体请参考[文档](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)。 * Memory-Usage 列显示的是当前的显存占用值,此处为0MiB,表示当前设备的显存未被占用;GPU-Util 列显示的是当前的GPU利用率,此处为0%,表示当前设备空闲,可以使用。开始使用飞桨前,请保证当前设备显存充足,且利用率处于空闲状态。 * 最后的 Processes 信息表示正在使用设备的进程,Docker 内可能存在不准确的情况,不影响使用。 **PaddlePaddle 安装验证** 首先运行如下命令确保 PaddlePaddle 正确安装 ```shell python -c "import paddle; paddle.utils.run_check()" ``` 预期会有如下输出 ```shell Running verify PaddlePaddle program ... W0720 09:29:22.035640 12791 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2 W0720 09:29:22.040702 12791 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1. PaddlePaddle works well on 1 GPU. W0720 09:29:36.763486 12791 fuse_all_reduce_op_pass.cc:79] Find all_reduce operators: 2. To make the speed faster, some all_reduce ops are fused during training, after fusion, the number of all_reduce ops is 2. PaddlePaddle works well on 8 GPUs. PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. 
``` 表示 PaddlePaddle 已经正确安装。 如果出现以下错误信息请确保 CUDA 安装正确且已根据 CUDA 安装路径正确配置的 LD_LIBRARY_PATH。 例如执行命令添加 `export LD_LIBRARY_PATH=/usr/lib64/:/usr/local/lib/:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}` 。 具体请参考[文档](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)。 ``` You are using GPU version Paddle, but your CUDA device is not set properly. ``` ### 2. 分布式环境验证 如果单机运行正常,但多机分布式运行异常请先根据 [网络问题排查](#31-网络问题排查) 部分排查网络问题再进行以下排查。 请先确保**各个机器**的 PaddlePaddle 环境已经正确安装,然后在等待验证的其中一个节点上运行如下命令 ```shell python -m paddle.distributed.launch run_check ``` > 默认验证 2 机分布式环境,如果需要验证更多机器(例如4个)环境下飞桨分布式是否运行正常,请添加节点数参数 --nnodes,具体命令如下: > > `python -m paddle.distributed.launch --nnodes=4 run_check` 预期输出如下 ```shell LAUNCH INFO 2022-07-20 09:38:33,349 PaddlePaddle Distributed Check begin... LAUNCH INFO 2022-07-20 09:38:33,358 ----------- Configuration ---------------------- LAUNCH INFO 2022-07-20 09:38:33,358 devices: None LAUNCH INFO 2022-07-20 09:38:33,358 elastic_level: -1 LAUNCH INFO 2022-07-20 09:38:33,358 elastic_timeout: 30 LAUNCH INFO 2022-07-20 09:38:33,358 gloo_port: 6767 LAUNCH INFO 2022-07-20 09:38:33,358 host: None LAUNCH INFO 2022-07-20 09:38:33,358 job_id: default LAUNCH INFO 2022-07-20 09:38:33,358 legacy: False LAUNCH INFO 2022-07-20 09:38:33,358 log_dir: log LAUNCH INFO 2022-07-20 09:38:33,358 log_level: ERROR LAUNCH INFO 2022-07-20 09:38:33,358 master: None LAUNCH INFO 2022-07-20 09:38:33,358 max_restart: 3 LAUNCH INFO 2022-07-20 09:38:33,358 nnodes: 2 LAUNCH INFO 2022-07-20 09:38:33,358 nproc_per_node: None LAUNCH INFO 2022-07-20 09:38:33,358 rank: -1 LAUNCH INFO 2022-07-20 09:38:33,358 run_mode: collective LAUNCH INFO 2022-07-20 09:38:33,359 server_num: None LAUNCH INFO 2022-07-20 09:38:33,359 servers: LAUNCH INFO 2022-07-20 09:38:33,359 trainer_num: None LAUNCH INFO 2022-07-20 09:38:33,359 trainers: LAUNCH INFO 2022-07-20 09:38:33,359 training_script: /usr/local/lib/python3.7/dist-packages/paddle/distributed/launch/plugins/test.py LAUNCH INFO 2022-07-20 09:38:33,359 training_script_args: [] LAUNCH INFO 2022-07-20 09:38:33,359 with_gloo: 1 LAUNCH INFO 2022-07-20 09:38:33,359 -------------------------------------------------- LAUNCH INFO 2022-07-20 09:38:33,360 Job: default, mode collective, replicas 2[2:2], elastic False LAUNCH INFO 2022-07-20 09:38:33,367 Waiting peer start... Copy the following command to other nodes to run. -------------------------------------------------------------------------------- python -m paddle.distributed.launch --master 10.10.1.1:49178 run_check -------------------------------------------------------------------------------- ``` > 如果当前安装的 PaddlePaddle 中未包含该工具,请根据上节提示安装 develop 版本进行测试。 根据提示,复制最后的命令(复制机器上个命令的执行结果,以下命令为示例),在其他节点上粘贴执行 ```shell python -m paddle.distributed.launch --master 10.10.1.1:49178 run_check ``` 执行后,如果配置正常则每个节点都会有后续输出 ```shell LAUNCH INFO 2022-07-20 09:46:41,571 Run Pod: xqqbsr, replicas 2, status ready LAUNCH INFO 2022-07-20 09:46:41,601 Watching Pod: xqqbsr, replicas 2, status running Prepare distributed training with 2 nodes 2 cards I0720 09:46:43.583846 13375 tcp_utils.cc:181] The server starts to listen on IP_ANY:14863 I0720 09:46:43.584153 13375 tcp_utils.cc:130] Successfully connected to 10.10.10.1:14863 W0720 09:46:47.089151 13375 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2 W0720 09:46:47.098454 13375 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1. 
2022-07-20 09:46:51,333-INFO: [topology.py:187:__init__] HybridParallelInfo: rank_id: 0, mp_degree: 1, sharding_degree: 1, pp_degree: 1, dp_degree: 4, mp_group: [0], sharding_group: [0], pp_group: [0], dp_group: [0, 1, 2, 3], check/clip group: [0] Distributed training start... [Epoch 0, batch 0] loss: 5.10316, acc1: 0.03125, acc5: 0.06250 Distributed training completed I0720 09:46:54.828758 13432 tcp_store.cc:257] receive shutdown event and so quit from MasterDaemon run loop LAUNCH INFO 2022-07-20 09:46:56,617 Pod completed LAUNCH INFO 2022-07-20 09:46:57,085 Exit code 0 ``` 则表示分布式环境配置正常,多机分布式训练可以成功运行。 > 如果其他节点执行命令后各个节点没有后续输出或输出不符合预期请参考 [FAQ](#3-faq) 部分解决。 **实际分布式训练任务验证** 在启动分布式任务前需要确保各个节点上安装好 PaddlePaddle 环境,同步好数据和代码。 例如准备好训练代码 `train.py`,同步至每个训练节点的工作目录。 ```python import numpy as np import paddle from paddle.distributed import fleet from paddle.vision.models import ResNet from paddle.vision.models.resnet import BottleneckBlock from paddle.io import Dataset, BatchSampler, DataLoader base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 epoch = 10 batch_num = 3 batch_size = 32 class_dim = 102 class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples def __getitem__(self, idx): image = np.random.random([3, 224, 224]).astype('float32') label = np.random.randint(0, class_dim - 1, (1, )).astype('int64') return image, label def __len__(self): return self.num_samples def optimizer_setting(parameter_list=None): optimizer = paddle.optimizer.Momentum( learning_rate=base_lr, momentum=momentum_rate, weight_decay=paddle.regularizer.L2Decay(l2_decay), parameters=parameter_list) return optimizer def train_resnet(): fleet.init(is_collective=True) resnet = ResNet(BottleneckBlock, 18, num_classes=class_dim) optimizer = optimizer_setting(parameter_list=resnet.parameters()) optimizer = fleet.distributed_optimizer(optimizer) resnet = fleet.distributed_model(resnet) dataset = RandomDataset(batch_num * batch_size) train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=2) for eop in range(epoch): resnet.train() for batch_id, data in enumerate(train_loader()): img, label = data label.stop_gradient = True out = resnet(img) loss = paddle.nn.functional.cross_entropy(input=out, label=label) avg_loss = paddle.mean(x=loss) acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) avg_loss.backward() optimizer.step() resnet.clear_gradients() print("[Epoch %d, batch %d] loss: %.5f, acc1: %.5f, acc5: %.5f" % (eop, batch_id, avg_loss, acc_top1, acc_top5)) if __name__ == '__main__': train_resnet() ``` 启动分布式训练的命令如下, 这个命令需要在每个参与训练的节点上执行(每个节点上的 `--master`都设置为同一个),如节点较多可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。 ```python python -m paddle.distributed.launch --master=10.10.1.1:49178 --nnodes=2 train.py ``` 这里用到了分布式启动最重要的两个参数 - `--nnodes` 为分布式任务的节点个数(一般为参与任务的机器数量),默认为 1 即启动单机任务,也可使用环境变量 PADDLE_NNODES 设置。 - `--master` 为分布式信息同步的主节点地址,ip:port 格式,可以由第一个启动的节点自动打印或者直接由用户设置为参与任务的任意节点 ip 和任意可用端口,也可使用环境变量 PADDLE_MASTER 设置。 > master 支持使用 etcd 服务,当使用 etcd 服务时,需要同时指定任务 id 以避免任务间冲突。具体地,可以通过 --job_id 参数或者设置环境变量 PADDLE_JOB_ID 指定任务id。 启动后,将看到如下日志,首先是配置部分 ```shell LAUNCH INFO 2022-07-20 12:10:15,863 ----------- Configuration ---------------------- LAUNCH INFO 2022-07-20 12:10:15,863 devices: None LAUNCH INFO 2022-07-20 12:10:15,863 elastic_level: -1 LAUNCH INFO 2022-07-20 12:10:15,863 elastic_timeout: 30 LAUNCH INFO 2022-07-20 12:10:15,863 gloo_port: 6767 LAUNCH INFO 2022-07-20 12:10:15,863 host: None LAUNCH 
INFO 2022-07-20 12:10:15,863 job_id: default LAUNCH INFO 2022-07-20 12:10:15,863 legacy: False LAUNCH INFO 2022-07-20 12:10:15,863 log_dir: log LAUNCH INFO 2022-07-20 12:10:15,863 log_level: INFO LAUNCH INFO 2022-07-20 12:10:15,863 master: 127.0.0.1:8890 LAUNCH INFO 2022-07-20 12:10:15,863 max_restart: 3 LAUNCH INFO 2022-07-20 12:10:15,863 nnodes: 2 LAUNCH INFO 2022-07-20 12:10:15,863 nproc_per_node: None LAUNCH INFO 2022-07-20 12:10:15,863 rank: -1 LAUNCH INFO 2022-07-20 12:10:15,863 run_mode: collective LAUNCH INFO 2022-07-20 12:10:15,863 server_num: None LAUNCH INFO 2022-07-20 12:10:15,863 servers: LAUNCH INFO 2022-07-20 12:10:15,863 trainer_num: None LAUNCH INFO 2022-07-20 12:10:15,863 trainers: LAUNCH INFO 2022-07-20 12:10:15,863 training_script: train.py LAUNCH INFO 2022-07-20 12:10:15,863 training_script_args: [] LAUNCH INFO 2022-07-20 12:10:15,864 with_gloo: 1 LAUNCH INFO 2022-07-20 12:10:15,864 -------------------------------------------------- ``` 这里打印分布式启动时的配置信息, 更多 launch 启动参数和用法请参考 [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/launch_cn.html) 或通过以下命令获得。 ```shell python -m paddle.distributed.launch --help ``` 然后打印的是任务启动相关的信息: ```shell LAUNCH INFO 2022-07-20 12:10:15,864 Job: default, mode collective, replicas 2[2:2], elastic False LAUNCH INFO 2022-07-20 12:10:15,870 Waiting peer start... LAUNCH INFO 2022-07-20 12:10:25,860 Run Pod: bpdjev, replicas 2, status ready LAUNCH INFO 2022-07-20 12:10:25,883 Watching Pod: bpdjev, replicas 2, status running ``` 其中,每行对应的具体含义解释如下: * 因为未设置 job_id,使用默认名称 default,启动的是 collective 模式,总共 2 个节点的分布式任务,不支持弹性(即节点数不可变)。 * 节点短暂处于等待其他节点启动的状态,如果其他节点已启动但日志长期处于等待状态,请根据 [FAQ](#31-网络问题排查) 进行排查。 * 任务准备启动,当前节点名为 bpdjev(该名称为随机生成)处于 ready 状态,当前节点包含 2 个进程(1 个进程对应 1 个 GPU)。 * 节点已启动,正在监控进程健康状态。 至此分布式启动成功,接下来打印业务日志(即用户代码相关输出日志) ```shell I0720 12:10:27.763713 14071 tcp_utils.cc:181] The server starts to listen on IP_ANY:11061 I0720 12:10:27.763914 14071 tcp_utils.cc:130] Successfully connected to 10.10.10.1:11061 W0720 12:10:30.666985 14071 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2 W0720 12:10:30.675815 14071 gpu_resources.cc:91] device: 0, cuDNN Version: 8.1. 2022-07-20 12:10:36,377-INFO: [topology.py:187:**init**] HybridParallelInfo: rank_id: 0, mp_degree: 1, sharding_degree: 1, pp_degree: 1, dp_degree: 4, mp_group: [0], sharding_group: [0], pp_group: [0], dp_group: [0, 1, 2, 3], check/clip group: [0] /usr/local/lib/python3.7/dist-packages/paddle/nn/layer/norm.py:668: UserWarning: When training, we now always track global mean and variance. 
"When training, we now always track global mean and variance.") [Epoch 0, batch 0] loss: 5.42939, acc1: 0.00000, acc5: 0.00000 [Epoch 0, batch 1] loss: 6.13338, acc1: 0.00000, acc5: 0.03125 [Epoch 0, batch 2] loss: 7.25566, acc1: 0.03125, acc5: 0.06250 // 此处省略多行类似日志 [Epoch 9, batch 0] loss: 7.23511, acc1: 0.00000, acc5: 0.00000 [Epoch 9, batch 1] loss: 4.69053, acc1: 0.03125, acc5: 0.06250 [Epoch 9, batch 2] loss: 5.08652, acc1: 0.00000, acc5: 0.03125 I0720 12:10:53.647085 14112 tcp_store.cc:257] receive shutdown event and so quit from MasterDaemon run loop ``` 至此,训练结束,业务代码结束,最后打印退出日志 ```shell LAUNCH INFO 2022-07-20 12:10:56,915 Pod completed LAUNCH INFO 2022-07-20 12:10:57,388 Exit code 0 ``` 更多日志请在 log 目录下查看,日志文件命名为` {job_id}.{节点名}.{卡号}.log` , 例如如下两个文件为本例子中 2 张卡分别对应的日志。 ```shell -rw-r--r-- 1 root root 2.9K Jul 20 12:10 default.bpdjev.0.log -rw-r--r-- 1 root root 2.7K Jul 20 12:10 default.bpdjev.1.log ``` 当有错误发生时,比如 GPU 卡被占用发生冲突时,会有如下输出 ```shell LAUNCH INFO 2022-07-21 11:58:59,451 Pod failed LAUNCH ERROR 2022-07-21 11:58:59,452 Container failed !!! Container rank 6 status failed cmd ['/usr/bin/python', '-u', 'train.py'] code 1 log log/default.fxemxd.6.log env {'GREP_COLOR': '1;31', 'CUDNN_VERSION': '8.1.1.33', 'LC_ALL': 'en_US.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/lib/python3.7/dist-packages/cv2/../../lib64:/usr/local/cuda-11.2/targets/x86_64-linux/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'LANG': 'en_US.UTF-8', 'HOSTNAME': 'xxxxx', 'OLDPWD': '/home/userhome', 'WITH_GPU': 'ON', 'NVIDIA_VISIBLE_DEVICES': 'all', 'NCCL_VERSION': '2.8.4', 'GOPATH': '/root/gopath', 'PWD': '/home/userhome/workspace/Paddle', 'HOME': '/home/userhome', 'GOROOT': '/usr/local/go', 'CLICOLOR': '1', 'DEBIAN_FRONTEND': 'noninteractive', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'TERM': 'xterm', 'WITH_AVX': 'ON', 'CUDA_VERSION': '11.2.1', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'SHLVL': '1', 'LANGUAGE': 'en_US.UTF-8', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450,driver<451', 'PATH': '/home/cmake-3.16.0-Linux-x86_64/bin:/usr/local/gcc-8.2/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin:/root/gopath/bin:/home/userhome/.fzf/bin', 'PS1': '\\[\\033[1;33m\\]kui \\[\\033[1;37m\\]\\h \\[\\033[1;32m\\]\\w\\[\\033[1;33m\\]$(__git_ps1 " \\[\\033[35m\\]{\\[\\033[36m\\]%s\\[\\033[35m\\]}")\\[\\033[0m\\] ', '_': '/usr/bin/python', 'CUSTOM_DEVICE_ROOT': '', 'OMP_NUM_THREADS': '1', 'QT_QPA_PLATFORM_PLUGIN_PATH': '/usr/local/lib/python3.7/dist-packages/cv2/qt/plugins', 'QT_QPA_FONTDIR': '/usr/local/lib/python3.7/dist-packages/cv2/qt/fonts', 'runtime_include_dir': '/usr/local/lib/python3.7/dist-packages/paddle/libs', 'POD_NAME': 'fxemxd', 'PADDLE_MASTER': '10.10.10.1:60216', 'PADDLE_GLOBAL_SIZE': '10', 'PADDLE_LOCAL_SIZE': '8', 'PADDLE_GLOBAL_RANK': '8', 'PADDLE_LOCAL_RANK': '6', 'PADDLE_NNODES': '2', 'PADDLE_TRAINER_ENDPOINTS': '10.10.10.1:49825,10.10.10.1:18781,10.10.10.1:53546,10.10.10.1:30837,10.10.10.1:11249,10.10.10.1:13092,10.10.10.1:11398,10.10.10.1:21309,10.10.10.1:47065,10.10.10.1:14834', 'PADDLE_CURRENT_ENDPOINT': '10.10.10.1:47065', 'PADDLE_TRAINER_ID': '8', 'PADDLE_TRAINERS_NUM': '10', 'PADDLE_RANK_IN_NODE': '6', 'FLAGS_selected_gpus': '6'} I0721 11:58:51.079766 29676 tcp_utils.cc:130] Successfully connected to 10.10.10.1:60216 W0721 11:58:54.582710 29676 gpu_resources.cc:61] Please NOTE: device: 6, GPU Compute Capability: 7.0, Driver API Version: 11.2, 
Runtime API Version: 11.2 W0721 11:58:54.590724 29676 gpu_resources.cc:91] device: 6, cuDNN Version: 8.1. Traceback (most recent call last): File "train.py", line 75, in train_resnet() File "train.py", line 39, in train_resnet fleet.init(is_collective=True) File "/usr/local/lib/python3.7/dist-packages/paddle/distributed/fleet/base/fleet_base.py", line 319, in init paddle.distributed.init_parallel_env() File "/usr/local/lib/python3.7/dist-packages/paddle/distributed/parallel.py", line 264, in init_parallel_env paddle.distributed.barrier(group=group) File "/usr/local/lib/python3.7/dist-packages/paddle/distributed/collective.py", line 334, in barrier task = group.process_group.barrier() OSError: (External) NCCL error(5), invalid usage. [Hint: 'ncclInvalidUsage'. The call to NCCL is incorrect. This is usually reflecting a programming error.] (at /paddle/Paddle/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc:214) LAUNCH INFO 2022-07-21 11:59:00,655 Exit code -15 ``` 这当中主要包含以下信息: * 发生错误的提示 Pod failed 和 Container failed !!!. * 错误的卡号(Container rank 6),错误命令和错误环境的环境变量。 * 具体的错误信息 trace,该部分取决于业务代码错误内容。 * 最后打印错误退出码 Exit code -15. 请根据报错信息进行排查,部分错误请参考 [FAQ](#3-faq)。 ### 3. FAQ #### 3.1 网络问题排查 请按照以下步骤排查网络问题 **获取节点IP** 使用命令 `hostname -i` 查看机器 ip,多网卡环境使用 `ifconfig` 命令查看(见上节),获得 IP。 ```shell $ hostname -i 10.10.10.1 ``` 如果这里得到的IP非预期使用的IP或者和日志中打印的IP不相符时,请根据后面小节排查是否是多网卡环境导致使用的网卡不一致。 **确认节点间是否能通过ping连接** 这里举例获得 ip 为 10.10.10.1,在其他节点上使用 `ping 10.10.10.1` 测试机器间是否能连接,有如下输出即为连接成功 ```shell $ ping 10.10.10.1 PING 10.10.10.1 (10.10.10.1) 56(84) bytes of data. 64 bytes from 10.10.10.1: icmp_seq=1 ttl=61 time=0.089 ms 64 bytes from 10.10.10.1: icmp_seq=2 ttl=61 time=0.057 ms 64 bytes from 10.10.10.1: icmp_seq=3 ttl=61 time=0.059 ms 64 bytes from 10.10.10.1: icmp_seq=4 ttl=61 time=0.078 ms 64 bytes from 10.10.10.1: icmp_seq=5 ttl=61 time=0.055 ms ^C --- 10.10.10.1 ping statistics --- 5 packets transmitted, 5 received, 0% packet loss, time 4053ms rtt min/avg/max/mdev = 0.055/0.067/0.089/0.016 ms ``` 长时间无输出或其他输出即无法连接,请联系机器网络管理员处理。 **确认节点间是否能通过HTTP/TCP连接** 在机器 `10.10.10.1`上运行命令 `python -m http.server 8090` 启动 http 服务, ```shell $ python -m http.server 8090 Serving HTTP on 0.0.0.0 port 8090 (http://0.0.0.0:8090/) ... ``` 如果提示端口被占用请使用其他可用端口启动服务,然后在其他的机器上运行命令 `curl 10.10.10.1:8090` ```shell $ curl 10.10.10.1:8090 Directory listing for /
  • train.py
  • ``` 有类似以上输出则说明连接成功,否则两台机器间网络可能存在问题,尝试其他端口仍有问题需要联系网络管理员处理。 **确认NCCL是否运行正常** 首先,设置环境变量NCCL_DEBUG,查看NCCL版本和当前使用的IP ```shell export NCCL_DEBUG=INFO python -m paddle.distributed.launch train.py ``` 在输出日志中找到 NCCL 版本信息 ```shell NCCL version 2.8.4+cuda11.2 ``` 确认各个节点的 NCCL 版本相同且高于 2.8。 以及在输出的信息中查找如下信息 ```shell [0] NCCL INFO NET/Socket : Using [0]eth0:10.10.10.1<0> [1] ``` 表示 nccl 使用了名为 `eth0` ip 为 10.10.10.1 的网卡,如果需要使用其他网卡,需要在运行命令前添加环境变量 ```shell export NCCL_SOCKET_IFNAME=eth1 ``` 注意这里添加的时网卡名不是 ip,对应关系参照 `ifconfig` 的输出。 上述测试均正常但是无法跑通分布式环境测试时 请使用 [nccl-test](https://github.com/NVIDIA/nccl-tests) 测试 GPU 通信是否正常。 #### 3.2 多Python环境问题 当工作环境中存在多个版本的 python 时可能存在不一致导致问题。 检查 python 版本 ```shell $ python --version Python 3.7.12 ``` 检查 python 安装目录 ```shell $ which python /usr/bin/python ``` 直接调用绝对路径验证版本 ```shell $ /usr/bin/python --version Python 3.7.12 ``` 如果两次打印的版本不匹配,可以通过使用绝对路径的方式解决。 获取绝对路径需要知道需要安装目录,默认环境中可以通过以下命令查看安装的版本。 ```shell $ ls /usr/bin/python* /usr/bin/python /usr/bin/python2.7 /usr/bin/python3.6 /usr/bin/python3.7 ``` 即当使用 python 时,使用绝对路径 `/usr/bin/python3.7` 替换。 #### 3.3 自动获取 IP 错误(多网卡环境问题) 使用 paddle.distributed.launch 会自动识别使用的 IP,在多网卡配置的环境中自动识别的网卡可能不是预期使用的网卡。 首先可以通过 `ifconfig` 命令查看机器的网卡配置情况,例如 ```shell docker0: flags=4163 mtu 1500 inet 10.0.3.1 netmask 255.255.255.0 broadcast 0.0.0.0 inet6 fe80::7050:1cff:fea2:14f3 prefixlen 64 scopeid 0x20 ether 1e:a6:0d:0d:3b:1e txqueuelen 1000 (Ethernet) RX packets 27201548 bytes 12176726229 (11.3 GiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 26762571 bytes 48666409371 (45.3 GiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 lo: flags=73 mtu 65536 inet 127.0.0.1 netmask 255.0.0.0 inet6 ::1 prefixlen 128 scopeid 0x10 loop txqueuelen 1000 (Local Loopback) RX packets 1321339447 bytes 1047567817083 (975.6 GiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 1321339447 bytes 1047567817083 (975.6 GiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 eth0: flags=4163 mtu 1500 inet 10.10.10.1 netmask 255.255.255.192 broadcast 10.127.4.191 inet6 f080::5200:4bff:f030:2090 prefixlen 64 scopeid 0x20 ether 50:6b:4b:31:2a:90 txqueuelen 1000 (Ethernet) RX packets 32040749852 bytes 43394575453133 (39.4 TiB) RX errors 0 dropped 391107 overruns 0 frame 0 TX packets 24330967394 bytes 30441950099144 (27.6 TiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 ``` 结果中虽然有3项甚至更多但这里只有一张 ip 为 `10.10.10.1` 网卡(inet值),docker0 为 Docker 虚拟网卡, lo 为本地回路,都不需要关注。 当启动分布式训练命令时,如果飞桨自动识别出的网卡IP不正确时,可以使用--host参数手动配置IP,如 ```python python -m paddle.distributed.launch --master=10.10.10.1:49178 --nnodes=2 --host=10.10.10.1 train.py ``` > 当 --master 地址识别错误时,也需要手动替换。 #### 3.4 机器端口有限制,需要使用固定端口 当集群环境限制通信网卡时需要手动配置所有 ip 和 port 以启动分布式,以机器 `10.10.10.1` 和机器 `10.10.10.2` 必须使用端口 8000-8999 的情况为例, 假设每台机器有两个卡,使用如下脚本设置每个卡对应进程的环境变量,依次启动进程。 ```shell # 所有卡 ip port 列表, ip1:port1,ip2:port2 export PADDLE_TRAINER_ENDPOINTS=10.10.10.1:8000,10.10.10.1:8001,10.10.10.2:8000,10.10.10.2:8001 # 所有卡数 export PADDLE_TRAINERS_NUM=4 # 当前卡 ip:port export PADDLE_CURRENT_ENDPOINT=10.10.10.1:8000 # 当前卡序号 export PADDLE_TRAINER_ID=0 # 当前卡在节点内序号 export PADDLE_RANK_IN_NODE=0 # 当前卡使用的 GPU 卡号 export FLAGS_selected_gpus=0 # 注意,这里不再使用 launch 启动,但本脚本需要运行多次 python train.py ``` 注意在执行时,需要依次替换后面4个环境变量为对应值启动。 #### 3.5 常用的通信问题排查 GPU/NCCL 问题请先核对**版本是否匹配**,通过 `nvidia-smi` 查看是否有进程正在占用,仍有问题需要通过 [nccl-test](https://github.com/NVIDIA/nccl-tests) 测试。常见运行时错误和解决方法如下, **NCCL error(5)** ```shell OSError: (External) NCCL error(5), invalid usage. [Hint: 'ncclInvalidUsage'. The call to NCCL is incorrect. 
This is usually reflecting a programming error.] ``` 原因和解决方法:该错误多为同一张 GPU 卡被多个进程同时使用导致冲突,请检查正在使用 GPU 的进程。如果需要在同一台机器上启动多个逻辑节点,可以使用 `CUDA_VISIBLE_DEVICES` 环境变量控制设备可见性。 **NCCL error(2)** ```shell ExternalError: Nccl error(2), unhandled system error ``` 原因和解决方法:该错误一般为 shm 设置太小,如果使用 Docker 环境需要在启动 Docker 时做映射和设置如 `--shm-size 32G`. ================================================ FILE: docs/docker_install.md ================================================ ## Docker 环境安装 使用 Docker 首先需要安装 Docker 环境,安装的完整流程请参考[文档](https://docs.docker.com/engine/install/),基础安装流程如下所述。 另外在 Docker 中使用 GPU 还需要安装 [nvida-container-runtime](https://github.com/NVIDIA/nvidia-container-runtime)。 **Ubuntu** 添加 apt 源。 ``` sudo curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add - sudo add-apt-repository "deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" ``` 软件源升级, 安装docker ``` sudo apt-get update sudo apt-get docker-ce docker-ce-cli containerd.io ``` 使用 `docker version` 查看 docker 版本信息无错误信息即说明安装运行正常。 安装 nvida-container-runtime ``` sudo apt-get install nvidia-container-runtimeb ``` **CentOS** 添加yum源。 ``` sudo wget -O /etc/yum.repos.d/docker-ce.repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo ``` 安装组件。 ``` sudo yum install docker-ce docker-ce-cli containerd.io ``` 启动Docker。 ``` sudo systemctl start docker ``` 查看Docker状态。 ``` sudo systemctl status docker ``` 如日志状态为 active (running) 则表示docker启动正常。 ``` ● docker.service - LSB: start and stop docker Loaded: loaded (/etc/rc.d/init.d/docker; bad; vendor preset: disabled) Active: active (running) since Thu 2022-08-11 20:11:19 CST; 3 days ago Docs: man:systemd-sysv-generator(8) Process: 29766 ExecStop=/etc/rc.d/init.d/docker stop (code=exited, status=0/SUCCESS) Process: 33215 ExecStart=/etc/rc.d/init.d/docker start (code=exited, status=0/SUCCESS) ``` 安装 nvida-container-runtime。 ``` sudo yum install nvidia-container-runtime ``` ================================================ FILE: docs/quick_start.md ================================================ # 快速开始 ## 1. 
环境准备 这里介绍使用裸机或者 Docker 环境使用 PaddleFleetX 的方法,用户根据具体情况选择一种安装部署方式即可。 使用多机训练时,需要在每台机器上都部署相应的环境。 ### 1.1 Docker 环境部署 推荐使用 Docker 安装部署 PaddleFleetX 进行大模型训练,Docker 环境的安装可以参考[文档](docker_install.md)。 请根据本地 CUDA 版本(使用 `nvidia-smi`命令查看)使用以下命令拉取对应或兼容的镜像, ``` docker pull registry.baidubce.com/ppfleetx/fleetx-cuda11.2-cudnn8:dev ``` 如本地环境cuda版本较低可以参考 Dockerfile 根据需要定制镜像。 大模型训练需要使用GPU,如已安装 nvida-container-runtime 可以使用以下命令运行镜像, ``` docker run -it --name=paddle --net=host -v /dev/shm:/dev/shm --shm-size=32G -v $PWD:/paddle --runtime=nvidia registry.baidubce.com/ppfleetx/ppfleetx-cuda11.2-cudnn8:v0.1.0 bash ``` 未安装 nvida-container-runtime 或启动后无法执行 `nvidia-smi` 查看GPU信息时可以尝试通过如下脚本启动运行, ```shell export CUDA_SO="$(\ls /usr/lib64/libcuda* | grep -v : | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | grep -v : | xargs -I{} echo '-v {}:{}')" export DEVICES=$(find /dev/nvidia* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}') nvsmi=`which nvidia-smi` docker run \ ${CUDA_SO} ${DEVICES} \ -v /dev/shm:/dev/shm \ -v $PWD:/paddle \ --name paddle \ --net=host \ --shm-size=32G \ -v $nvsmi:$nvsmi \ -it \ registry.baidubce.com/ppfleetx/ppfleetx-cuda11.2-cudnn8:v0.1.0 \ bash ``` 以上命令 `-v $PWD:/paddle` 将当前目录映射到 /paddle 目录,在 docker 环境内部对该目录的更改将会持久化。 > 为保证通信效率和通信正常,添加参数 --net=host 使用主机网络,更多 docker run 参数说明请参考 [docker 文档](https://docs.docker.com/engine/reference/commandline/run/)。 ### 1.2 裸机部署 **安装 PaddlePaddle** 首先根据环境在 [安装文档](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html) 选择对应的版本使用 pip install 执行对应命令安装 PaddlePaddle. **请务必按照文档安装 GPU 版本且验证安装成功**。 例如使用如下命令将会安装基于 CUDA 11.2 最新版本的 PaddlePaddle. ```shell python -m pip install paddlepaddle-gpu==0.0.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html ``` 安装遇到问题以及环境验证的方法也可以参考[文档](deployment_faq.md#1-单机环境验证)。 **安装依赖** 使用以下命令安装 PaddleFleetX 运行所需依赖。 ```shell python -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ## 2. 模型训练 进入环境后首先使用以下命令拉取最新代码 ```shell git clone https://github.com/PaddlePaddle/PaddleFleetX.git ``` 然后根据需求选择对应的训练方式。 ### 2.1. 
单卡训练 切换工作目录并下载demo数据, ``` mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ``` 然后使用以下命令运行程序, ```shell python ./tools/train.py -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单卡训练,可将对应yaml文件中的Model-hidden size值改为原来的1/2即可。 **运行日志** ``` [2022-09-21 05:42:26,980] [ INFO] - [train] epoch: 0, batch: 0, loss: 10.999595642, avg_batch_cost: 2.73014 sec, speed: 0.37 step/s, ips_total: 3001 tokens/s, ips: 3001 tokens/s, learning rate: 2.77778e-08 [2022-09-21 05:42:27,492] [ INFO] - [train] epoch: 0, batch: 1, loss: 10.997043610, avg_batch_cost: 0.51164 sec, speed: 1.95 step/s, ips_total: 16011 tokens/s, ips: 16011 tokens/s, learning rate: 4.16667e-08 [2022-09-21 05:42:27,997] [ INFO] - [train] epoch: 0, batch: 2, loss: 10.994422913, avg_batch_cost: 0.50457 sec, speed: 1.98 step/s, ips_total: 16236 tokens/s, ips: 16236 tokens/s, learning rate: 5.55556e-08 [2022-09-21 05:42:28,503] [ INFO] - [train] epoch: 0, batch: 3, loss: 11.005314827, avg_batch_cost: 0.50497 sec, speed: 1.98 step/s, ips_total: 16223 tokens/s, ips: 16223 tokens/s, learning rate: 6.94444e-08 [2022-09-21 05:42:29,009] [ INFO] - [train] epoch: 0, batch: 4, loss: 10.988020897, avg_batch_cost: 0.50480 sec, speed: 1.98 step/s, ips_total: 16228 tokens/s, ips: 16228 tokens/s, learning rate: 8.33333e-08 [2022-09-21 05:42:29,513] [ INFO] - [train] epoch: 0, batch: 5, loss: 10.983006477, avg_batch_cost: 0.50393 sec, speed: 1.98 step/s, ips_total: 16256 tokens/s, ips: 16256 tokens/s, learning rate: 9.72222e-08 [2022-09-21 05:42:30,018] [ INFO] - [train] epoch: 0, batch: 6, loss: 10.988539696, avg_batch_cost: 0.50427 sec, speed: 1.98 step/s, ips_total: 16245 tokens/s, ips: 16245 tokens/s, learning rate: 1.11111e-07 ``` ### 2.2. 
单机多卡训练 切换工作目录并下载demo数据, ```shell mkdir data wget -O data/gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O data/gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ``` 然后使用以下命令运行单机多卡程序, ``` python -m paddle.distributed.launch \ ./tools/train.py \ -c ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的环境例如 16G 显存下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ``` python -m paddle.distributed.launch \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml -o Model.hidden_size=1024 ``` > 更多 launch 启动参数和用法请参考 [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/launch_cn.html)。 成功则开始训练过程, ``` LAUNCH INFO 2022-08-15 07:37:38,946 ----------- Configuration ---------------------- LAUNCH INFO 2022-08-15 07:37:38,946 devices: None LAUNCH INFO 2022-08-15 07:37:38,947 elastic_level: -1 LAUNCH INFO 2022-08-15 07:37:38,947 elastic_timeout: 30 LAUNCH INFO 2022-08-15 07:37:38,947 gloo_port: 6767 LAUNCH INFO 2022-08-15 07:37:38,947 host: None LAUNCH INFO 2022-08-15 07:37:38,947 ips: None LAUNCH INFO 2022-08-15 07:37:38,947 job_id: default LAUNCH INFO 2022-08-15 07:37:38,947 legacy: False LAUNCH INFO 2022-08-15 07:37:38,947 log_dir: log LAUNCH INFO 2022-08-15 07:37:38,947 log_level: INFO LAUNCH INFO 2022-08-15 07:37:38,947 master: None LAUNCH INFO 2022-08-15 07:37:38,947 max_restart: 3 LAUNCH INFO 2022-08-15 07:37:38,947 nnodes: 1 LAUNCH INFO 2022-08-15 07:37:38,947 nproc_per_node: None LAUNCH INFO 2022-08-15 07:37:38,947 rank: -1 LAUNCH INFO 2022-08-15 07:37:38,947 run_mode: collective LAUNCH INFO 2022-08-15 07:37:38,947 server_num: None LAUNCH INFO 2022-08-15 07:37:38,947 servers: LAUNCH INFO 2022-08-15 07:37:38,947 start_port: 6070 LAUNCH INFO 2022-08-15 07:37:38,947 trainer_num: None LAUNCH INFO 2022-08-15 07:37:38,947 trainers: LAUNCH INFO 2022-08-15 07:37:38,947 training_script: run_pretrain.py LAUNCH INFO 2022-08-15 07:37:38,947 training_script_args: ['-c', './configs_1.3B_dp8.yaml'] LAUNCH INFO 2022-08-15 07:37:38,947 with_gloo: 1 LAUNCH INFO 2022-08-15 07:37:38,947 -------------------------------------------------- LAUNCH INFO 2022-08-15 07:37:38,948 Job: default, mode collective, replicas 1[1:1], elastic False LAUNCH INFO 2022-08-15 07:37:38,949 Run Pod: vqhbut, replicas 8, status ready LAUNCH INFO 2022-08-15 07:37:39,063 Watching Pod: vqhbut, replicas 8, status running ## 启动配置 [2022-08-15 07:41:23,063] [ INFO] - [train] epoch: 0, batch: 0, loss: 11.255846024, avg_batch_cost: 7.06713 sec, speed: 0.14 step/s, ips_total: 9273 tokens/s, ips: 1159 tokens/s, learning rate: 2.77778e-08 ## 更多训练日志 ``` 如有启动异常请根据[文档](deployment_faq.md#1-单机环境验证)进行工作环境验证,其他问题可参考[FAQ](deployment_faq.md#3-faq)解决。 ## 2.3. 
多机多卡训练 使用以下命令进行多机分布式训练,其中 --nnodes 参数为分布式训练机器数量,--master 为训练机器中其中一台机器的IP,运行时需要将命令中示例IP替换为真实的机器IP和任意可用端口,然后在**每个节点**上都运行以下命令, 如果不知道机器IP可以不设置--master参数先在一台机器上启动,然后根据提示复制命令在其他机器上启动即可。 ``` python -m paddle.distributed.launch --master=10.10.10.1:8099 --nnodes=2 \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml ``` > 该示例为16卡任务,需要满足总卡数为16的要求。 > 注意这里需要使用单机多卡训练部分的代码和数据。 成功则开始多机训练过程,日志和单机多卡类似,日志异常时请按照[文档](deployment_faq.md#2-分布式环境验证)进行环境验证和问题排查。 若要在显存容量更小的环境例如 16G 显存下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ``` python -m paddle.distributed.launch --master=10.10.10.1:8099 --nnodes=2 \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml -o Model.hidden_size=2048 ``` 更多大模型多机训练内容可见[文档](../projects/gpt/docs/README.md)。 ================================================ FILE: docs/standard.md ================================================ ## 模型接入规范 本文讲述在PaddleFleetX repo接入一个新模型,该如何添加和修改文件,以及相应的规范化流程。 ### 1.PaddleFleetX 介绍 PaddleFleetX是飞桨大模型训练推理一站式工具组件。与Paddle.distributed、Paddle.fleet API的关系如下:
（图：PaddleFleetX 与 Paddle 的关系）
    目前支持的模型列表如下: - GPT ### 2.目录结构 整体的PaddleFleetX的目录结构如下: ```text . ├── benchmarks # benchmark评估结果和示例代码 │   └── README.md ├── Dockerfile ├── docs # 文档 │   ├── cluster_deployment.md │   ├── deployment_faq.md │   ├── docker_install.md │   ├── images │   ├── quick_start.md │   └── standard.md ├── ppfleetx │   ├── configs │   ├── core # 管理模型的组网规范,执行规范 │   ├── data # 数据集下载、预处理脚本 │   ├── models # 模型组网 │   ├── optims # 优化器类定义 │   └── utils ├── projects # 模型脚本,包含GPT模型 │   ├── ernie │   ├── gpt │   ├── imagen │   └── vit ├── README.md ├── requirements.txt ├── tasks │   └── gpt └── tools ├── auto.py ├── eval.py ├── export_model.py ├── inference.py └── train.py ``` ### 3.模型接入方法 根据模型训练的阶段不同,整体分为两个阶段:组网阶段和执行阶段。 #### 3.1 组网阶段 需要不同的分布式策略,它们会调用github/PaddlePaddle/Paddle核心框架里面的分布式高层API(FleetAPI),参考: 需要的并行方式。 - [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html) - [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html ) - [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html) - [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html) #### 3.2 执行阶段 ##### BasicModule 执行阶段采用Engine模块分装,为了能够保证Engine的模块化调用,需要将组网为``BasicModule``的子类,保证其规范化输出。其中``BasicModule``提供了多个统一的函数方法: | **函数名** | **参数释义** | |------------------------------|------------------------| | init | 接受用户的组网参数,实现Module初始化 | | pretreating_batch | 预处理batch数据 | | train_step | 一次完整的训练 | | train_step_end | 一次完整的训练后的操作 | | training_epoch_end | 一次完整的epoch训练后的操作 | | validation_step | 一次完整的验证 | | validation_step_end | 一次完整的验证后的操作 | | validation_epoch_end | 一次完整的epoch验证后的操作 | | test_step | 一次完整的测试 | | test_step_end | 一次完整的测试后的操作 | | configure_optimizers | 配置这次训练的优化器 | ##### EagerEngine ``EagerEngine``将上述函数串联起来,实现底层的执行逻辑对上层的屏蔽,减少冗余代码。 初始化需要传入对应的config配置,其层级配置如下: ```yaml Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | accumulate_steps | 梯度累加次数 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | enable | 是否使用混合精度策略进行训练 | | dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | | level | 混合精度训练模式,默认``O2``模式 | | scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔 | | save_epoch | 保存模型epoch间隔 | | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ``EagerEngine``中重载了多个常用函数,整体的说明如下: | **函数名** | **参数释义** | |------------------------------|------------------------| | fit | 模型训练 | | evaluate | 模型评估 | | predict | 模型预测 | | save | 模型参数保存 | | load | 模型参数加载 | 其中module和engine函数方法的映射关系如下: - fit ``fit``实现模型的训练,EagerEngine的内部调用伪代码如下: ```python module.model.train() for batch in train_dataloader: module.training_step() 
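# training_step 计算并返回当前 batch 的 loss；下面的 training_step_end 负责日志等后处理，随后由优化器完成参数更新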
module.training_step_end() module.optimizer.step() module.lr_scheduler.step() module.optimizer.clear_grad() ``` - evaluate ``evaluate``实现模型的评估,``EagerEngine``的内部调用伪代码如下: ```python with paddle.no_grad(): module.model.eval() for batch in vailidation_dataloader: module.validation_step() module.validation_step_end() ``` - test `` predict``实现模型的预测,``EagerEngine``的内部调用伪代码如下: ```python with paddle.no_grad(): module.model.eval() for batch in test_dataloader: module.predict_step() module.predict_step_end() ``` ### 4.模型接入示例 1、构建组网文件,放置在`ppfleex/models`目录下。 ```python class SimpleNet(nn.Layer): def __init__(self): super(SimpleNet, self).__init__() self.fc1 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc2 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc3 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc4 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) self.fc5 = nn.Linear(IMAGE_SIZE, CLASS_NUM) def forward(self, image, label=None): output = self.fc1(image) output = self.fc2(output) output = self.fc3(output) output = self.fc4(output) return self.fc5(output) class LossLayer(nn.Layer): def __init__(self): super(LossLayer, self).__init__() def forward(self, image, label=None): return F.cross_entropy(image, label) ``` 2、构建BasicModule,设置符合要求的组网形式,放置在`ppfleetx/models`目录下;并引入`ppfleetx/models/__init__.py` ```python class TestModule(BasicModule): def __init__(self): super().__init__() self.loss_fn = LossLayer() def get_model(self): model = SimpleNet() return model def forward(self, x): return self.model(x) def training_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def training_step_end(self, log_dict): logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'])) def validation_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def validation_step_end(self, log_dict): logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost'])) def test_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def test_step_end(self, log_dict): logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'])) ``` 3、通过config配置Dataset Dataset可以通过config文件进行配置。新增Dataset类型放置在 `ppfleetx/data/dataset`,同时其构造参数于其对应的Dataset字段一致。比如: ```python class GPTDataset(paddle.io.Dataset): def __init__(self, input_dir, split, max_seq_len, num_samples, mode, seed=1234): ``` 对应config中的yaml字段: ```yaml Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: DistributedBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ``` 4、通过config配置Optimizer和LR ```yaml Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False ``` 5、运行模型相关的配置文件以及相应的运行脚本,放置在[projects](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/projects)目录。 ### 5.模型推理示例 模型训练完成后,可使用飞桨高性能推理引擎Paddle Inference通过如下方式进行推理部署。 总共分为两个步骤:模型导出和推理部署。可以参考[GPT的模型推理](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/docs/inference.md)。 ================================================ FILE: examples/transformer/__init__.py 
================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: examples/transformer/models/GPT/docs/README.md ================================================ # GPT ## 模型介绍 GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)/[3](https://arxiv.org/pdf/2005.14165.pdf) 是以[Transformer](https://arxiv.org/abs/1706.03762) 解码器为网络基本组件,使用自回归的方式在大规模无标注文本语料上进行预训练得到的语言生成模型。 本项目是语言模型 GPT 的 PaddlePaddle 大模型实现。目前,PaddleFleetX 提供了 [GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型文件;分别基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl) 和 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) 数据集,采用 ACC(accuracy) 和 PPL(perplexity) 指标后的评估结果如下: | **模型文件** | **ACC** | **PPL** | |---------|-----------|---------------| | GPT-345M | 44.17% | 18.01 | 下面是本例的简要目录结构及说明: ```text . ├── docs # 一站式文档入口 ├── finetune # GLUE 下游任务微调入口 ├── generation # 文本生成体验入口 ├── offline-eval # 模型精度离线评估入口 ├── pretrain # 预训练入口 ``` ## 快速开始 ### 环境依赖 请确保已根据根目录 requirements.txt 安装所需依赖,或者通过以下命令快速安装 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ### 数据准备 数据获取和制作详见[GPT 模型预训练数据准备流程](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/gpt) 为了方便用户运行测试本模型,此处提供处理好的300M的训练样本,在单卡训练或混合并行训练前都需要通过以下命令获取数据。 **数据下载命令** ```shell cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 # 下载样例数据 mkdir data && cd data wget -O gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz cd .. 
# 回到 GPT 目录下 ``` ### 模型训练 除了单卡训练,飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略,减少显存占用、加速训练,达到大模型可训练且训得快的效果。在模型训练前,需要根据模型规模选择合适的并行策略。下面分别从单卡训练和混合并行训练两个方面来介绍GPT模型训练的配置文件和启动方式。 - [单卡训练](./single_card.md) - [混合并行训练](./hybrid_parallel.md) ### 文本生成体验 - [单卡预训练模型文本生成](./single_card.md#GPT-Zero-shot-文本生成) - [混合并行预训练模型文本生成](./hybrid_parallel.md#GPT-Zero-shot-文本生成) ### 模型压缩 - [量化训练](./quantization_aware_training.md) ### 推理部署 - [推理部署](inference.md) ### GLUE 下游任务微调 - [单卡微调](./single_finetune.md) ## 参数释义 ### 全局信息 全局参数指定训练的batch size,以及设备、随机种子等信息;除此之外,模型训练/验证/推理等过程中的必要参数设置也在这里完成。 ```yaml Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | device | 设备信息 | | seed | 随机数种子 | | global_batch_size | 全局的batch size大小,即一次参数更新等效的batch size | | local_batch_size | 每个进程训练的batch size大小 | | micro_batch_size | 每次前向计算的batch size大小 | | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | accumulate_steps | 梯度累加次数 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | | enable | 是否使用混合精度策略进行训练 | | dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | | level | 混合精度训练模式,默认``O2``模式 | | scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔step数 | | save_epoch | 保存模型间隔epoch数 | | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ### 模型网络 网络部分完成了网络的组网操作,GPT在[single_model.py](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/dygraph/single_model.py)下。 可以使用配置文件配置模型的规模,如: ```yaml Model: name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True fuse_attn_qkv: True sequence_parallel: False ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | vocab_size | 训练词表大小 | | hidden_size | 隐藏层大小 | | num_layers | transformer层数 | | num_attention_heads | attention head的数量 | | max_seq_len | 输入文本序列的长度 | | ffn_hidden_size | ffn层大小,一般为隐藏层的四倍 | | attention_probs_dropout_prob | attention中的dropout的失活率 | | max_position_embeddings | position embedding的长度 | | type_vocab_size | 词表类型 | | initializer_range | 参数初始化的范围 | | use_recompute | 是否使用recompute训练 | | recompute_granularity | recompute训练的粒度,可选 `full` `full_attn` `core_attn`,full即recompute全部transformer,full_attn表明只recompute所有self attention部分,core_attn表明只recompute `softmax(qkT)v` 部分。注:显存占用方面,`core_attn` > `full_attn` > `full`,若所选策略产生OOM错误,可以适当更改recompute_granularity | |no_recompute_layers| list of integer,标识哪些层的transformer不需要进行recompute。所有在该list中的值应该 >= 0 同时应该 < num_layers。向该参数中增加不进行recompute 
的层数可以提升模型训练的整体吞吐,但是会适当的增加显存。若训练中发现有显存富裕,可以适当增加不进行recompute的层数。如果使用该参数后出现OOM错误,可以适当减小不进行recompute的层数。 | | fused_linear | 是否使用fused_linear代替传统Linear加速训练。注:该功能需要cuda 11.6及以上编译的paddle支持。 | | fuse_attn_qkv | 是否对attention层中的qkv计算使用fuse策略以加速训练 | | sequence_parallel | 是否使用序列并行策略以加速训练。注:只有混合并行的GPT才支持该功能,它与张量模型并行共用通信组,当mp_degree=1时,序列并行策略会被强制关闭。 | | virtual_pp_degree | 虚拟流水线并行维度,该参数会减小流水线bubble的占比以提升流水线的吞吐。但是该参数会增加流水线间的通讯,所以该参数的推荐值为2。并且,只有 num_layers可以被 pp_degree * virtual_pp_degree 整除时,才可以使用虚拟流水线并行。 | ### 数据集 数据集参数分为“Train”、“Eval”和“Test”三部分,分别对应模型预训练、离线评估、推理等三个模块。 每个模型的配置参数都包含以下内容: ```yaml Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: DistributedBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | dataset.name | 指定自定义数据集的名称 | | input_dir | 指定输入文件,可以使用目录,指定目录时将包括目录中的所有文件 | | split | 训练集,验证集和测试集的切分比例 | | max_seq_len | 输入文本序列的长度 | | sampler.name | 指定自定义采样器的名称 | | shuffle | 是否需要在生成样本下标时打乱顺序 | | drop_last | 是否需要丢弃最后无法凑整一个mini-batch的样本 | | num_workers | 用于加载数据的子进程个数 | | return_list | 每个设备上的数据是否以list形式返回 | | collate_fn | 通过此参数指定如果将样本列表组合为mini-batch数据;支持自定义 | ### 优化器 GPT训练默认使用AdamW优化器以及cosine学习率衰减,这里通过配置文件配置优化器的参数,如: ```yaml Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | name | 指定自定义优化器的名称 | | weight_decay | weight的衰减率 | | beta1 | 一阶矩估计的指数衰减率 | | beta2 | 二阶矩估计的指数衰减率 | | epsilon | 指定优化器需要优化的参数 | | lr.name | 指定自定义学习率策略的名称 | | decay_steps | 衰减的步长 | | warmup_rate | warmup 率 | | max_lr | Adam 的初始最大学习率 | | min_lr | Adam 的初始最小学习率 | | grad_clip.name | 指定自定义梯度裁剪策略的名称 | | clip_norm | 所允许的范数最大值 | | tensor_fusion | 是否使用tensor_fustion功能加速训练 | 另外,[Profiler](./hybrid_profiler.md)中还介绍了在 GPT 中开启 Profiler 并分析调试分析结果的方法及相关的参数解释。 ### 模型压缩 PaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法:量化训练(Qutization Aware Training,QAT)、结构化稀疏(Structured Pruning,SP)和知识蒸馏(Knowledge Distillation,KD)。详细参数介绍见[模型压缩介绍](https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/docs/compression.md)。 ## 参考文献 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) - [Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165.pdf) - [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) ================================================ FILE: examples/transformer/models/GPT/docs/hybrid_parallel.md ================================================ # GPT 混合并行模型训练 当训练超大模型时,就必须借助混合并行策略,混合并行策略分别指数据并行、张量模型并行、流水线并行和分组切片并行。其中数据并行保存完整的模型参数并独立处理一份子数据集,以加速模型训练过程;张量模型并行将网络中的张量(Tensor)切分到不同的设备,从而降低单个设备的显存消耗;流水线并行将模型的不同层放置到不同的计算设备,降低单个计算设备的显存消耗;分组切片并行将参数和模型状态划分到不同卡上,每个GPU只保存部分副本,以减少显存占用。联合四种训练方式,可以实现更大模型、更快训练的效果。具体策略以及相关FleetAPI介绍可以参考以下教程: - [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html) - [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html ) - 
[流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html) - [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html) ## 参数释义 ### 并行维度 当前GPT模型已适配3D混合并行,并能够在训练超大模型,用户可以通过配置文件选择并行的维度。 ```yaml Distributed: dp_degree: 2 mp_degree: 2 pp_degree: 2 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ``` 其中参数说明: | **参数名** | **参数释义** | |------------------|--------------------------------------| | dp_degree | 数据并行维度 | | mp_degree | 张量模型并行维度 | | pp_degree | 流水线并行维度 | | sharding_degree | 分组切分并行维度 | | sharding_stage | 切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 | | sharding_offload | CPU offload策略 | |reduce_overlap| 是否在sharding stage 2的模式下进行reduce通讯与反向计算的overlap,该策略暂时不支持sharding_offload| |broadcast_overlap| 是否在sharding stage 2的模式下进行broadcast通讯与下一个batch的 前向计算的overlap,该策略暂时不支持sharding_offload。若使用该模型,在evaluation与save之前,必须调用 `paddle.device.cuda.synchronize()` 方法| ## 运行方式 本目录中按照345M、1.3B、6.7B和175B规模大小,给出32G V100环境下GPT模型混合并行训练的策略配置如下: | 模型规模 | 训练策略 | yaml文件 | |----------|---------------------------|------------------------------| | 345M | fp16+mp8+qat | qat_gpt_345M_mp8.yaml | | 1.3B | fp16+dp8+recompute | pretrain_gpt_1.3B_dp8.yaml | | 6.7B | fp16+sharding16+recompute | pretrain_gpt_6.7B_sharding16.yaml | | 175B | fp16+mp8+pp16+recompute | pretrain_gpt_175B_mp8_pp16.yaml | 若要在显存容量更小的16G V100环境下进行GPT大模型训练,可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。 ### 策略支持 飞桨的混合并行技术包括4个维度:数据并行、张量模型并行、流水线并行和分组切片并行,此外还支持重计算、offload、混合精度、序列并行等策略,来减少显存占用、加速训练。 目前,GPT模型训练已支持前3个维度的任意策略组合,但分组切片并行stage2/3仅支持与数据并行策略组合使用;详见下表。 | | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute | |-----------------|---------------|-----------------|-------------------|-----------|-----------| | sharding stage1 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage2 | ✓ | ㄨ | ㄨ | ✓ | ✓ | | sharding stage3 | ✓ | ㄨ | ㄨ | ✓ | ✓ | ### 单机训练 以单机1.3B模型数据并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行。 **启动命令** ```shell cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ pretrain/run.py \ -c pretrain/configs/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: **启动命令** ```shell log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ pretrain/run.py \ -c pretrain/configs/pretrain_gpt_1.3B_dp8.yaml \ -o Model.hidden_size=1024 ``` 每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到;若未指定,日志路径为`log/workerlog.x`。运行日志具体内容如下: **运行日志** ``` [2022-09-21 05:43:58,797] [ INFO] - [train] epoch: 0, batch: 0, loss: 10.992407799, avg_batch_cost: 5.51734 sec, speed: 0.18 step/s, ips_total: 11878 tokens/s, ips: 1485 tokens/s, learning rate: 2.77778e-08 [2022-09-21 05:43:59,508] [ INFO] - [train] epoch: 0, batch: 1, loss: 11.000075340, avg_batch_cost: 0.71029 sec, speed: 1.41 step/s, ips_total: 92267 tokens/s, ips: 11533 tokens/s, learning rate: 4.16667e-08 [2022-09-21 05:44:00,242] [ INFO] - [train] epoch: 0, batch: 2, loss: 11.017463684, avg_batch_cost: 0.73301 sec, speed: 1.36 step/s, ips_total: 89406 tokens/s, ips: 11176 tokens/s, learning rate: 5.55556e-08 [2022-09-21 05:44:00,965] [ INFO] - [train] epoch: 0, batch: 3, loss: 10.983654976, avg_batch_cost: 0.72319 sec, speed: 1.38 step/s, ips_total: 90620 tokens/s, ips: 11328 
tokens/s, learning rate: 6.94444e-08 [2022-09-21 05:44:01,678] [ INFO] - [train] epoch: 0, batch: 4, loss: 11.014451981, avg_batch_cost: 0.71223 sec, speed: 1.40 step/s, ips_total: 92016 tokens/s, ips: 11502 tokens/s, learning rate: 8.33333e-08 [2022-09-21 05:44:02,385] [ INFO] - [train] epoch: 0, batch: 5, loss: 11.005180359, avg_batch_cost: 0.70707 sec, speed: 1.41 step/s, ips_total: 92687 tokens/s, ips: 11586 tokens/s, learning rate: 9.72222e-08 [2022-09-21 05:44:03,100] [ INFO] - [train] epoch: 0, batch: 6, loss: 10.989698410, avg_batch_cost: 0.71402 sec, speed: 1.40 step/s, ips_total: 91785 tokens/s, ips: 11473 tokens/s, learning rate: 1.11111e-07 [2022-09-21 05:44:03,806] [ INFO] - [train] epoch: 0, batch: 7, loss: 10.992337227, avg_batch_cost: 0.70554 sec, speed: 1.42 step/s, ips_total: 92888 tokens/s, ips: 11611 tokens/s, learning rate: 1.25000e-07 [2022-09-21 05:44:04,516] [ INFO] - [train] epoch: 0, batch: 8, loss: 10.972790718, avg_batch_cost: 0.71011 sec, speed: 1.41 step/s, ips_total: 92290 tokens/s, ips: 11536 tokens/s, learning rate: 1.38889e-07 [2022-09-21 05:44:05,228] [ INFO] - [train] epoch: 0, batch: 9, loss: 10.983499527, avg_batch_cost: 0.71128 sec, speed: 1.41 step/s, ips_total: 92138 tokens/s, ips: 11517 tokens/s, learning rate: 1.52778e-07 ``` ### 多机训练 若需要在更多机器上进行大模型训练,则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令(master节点ip为训练所用某一台机器的ip即可)。 以2机16卡32G V100上的6.7B模型分组切分并行训练为例,启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ pretrain/run.py -c pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型两机训练,也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" pretrain/run.py \ -c pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml \ -o Model.hidden_size=2048 ``` 若要执行16机175B大模型混合并行训练,以运行启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_mp8_pp16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=16 --devices "0,1,2,3,4,5,6,7" pretrain/run.py \ -c pretrain/configs/pretrain_gpt_175B_mp8_pp16.yaml ``` 当节点较多时,可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。 ### 量化训练 若需要对模型进行量化训练,按照以上在配置文件中添加量化参数,可参考`qat_gpt_345M_mp8.yaml`,量化训练时可以可以适当减少训练轮数和学习率。以单机345M模型模型并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行,命令如下: ```shell log_dir=log_mp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" pretrain/run.py \ -c pretrain/configs/qat_gpt_345M_mp8.yaml \ -o Global.max_steps=100000 \ -o Optimizer.lr.decay_steps=72000 \ -o Optimizer.lr.max_lr=5.0e-6 \ -o Optimizer.lr.min_lr=1.0e-6 ``` # GPT Zero-shot 文本生成 ## 参数释义 ```yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | top_k | 每次为采样挑选保留分数最高的 k 个 token | | top_p | 如果设置小于 1.0 的小数,则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0 | | temperature | 调节下一个 token 的概率温度,logits = logits / temperature,默认值为 1.0 | | min_dec_len | 最小生成 token 长度 | | max_dec_len | 最大生成 token 长度 | | num_return_sequences | 每个输入生成的序列个数,默认值为 1 | | decode_strategy | 解码策略,默认值为 "sampling",目前只支持 "sampling",未来会支持 "greedy_search","beam_search" | 
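上表中的 top_k、top_p、temperature 共同决定 `sampling` 解码每一步如何从词表分布中选取下一个 token。下面给出一个与框架无关的最小示意（纯 numpy 实现，`sample_next_token` 为示例命名，并非 PaddleFleetX 的真实接口），用于说明这几个参数的过滤顺序与含义：

```python
import numpy as np

def sample_next_token(logits, top_k=50, top_p=0.75, temperature=1.0, rng=None):
    """对单步 logits 依次应用 temperature、top_k、top_p 过滤后随机采样，返回被选中的 token id。"""
    rng = rng or np.random.default_rng()
    logits = np.asarray(logits, dtype="float64") / temperature
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()

    # top_k：只保留概率最高的 k 个 token
    top_k = min(top_k, probs.size)
    if top_k > 0:
        kth = np.sort(probs)[-top_k]
        probs[probs < kth] = 0.0

    # top_p（nucleus）：按概率从高到低累加，保留累计概率首次达到 top_p 的最小 token 集合
    order = np.argsort(-probs)
    cumulative = np.cumsum(probs[order] / probs.sum())
    to_drop = order[cumulative > top_p][1:]  # 保留越过 top_p 的那个 token，屏蔽其余
    probs[to_drop] = 0.0

    probs /= probs.sum()
    return int(rng.choice(probs.size, p=probs))

# 示例：从一个长度为 8 的“词表”分布中采样下一个 token
fake_logits = np.array([2.0, 1.5, 1.0, 0.5, 0.0, -0.5, -1.0, -2.0])
print(sample_next_token(fake_logits, top_k=4, top_p=0.9))
```

实际生成时，模型每一步都会用过滤后的分布采样一个 token，直到生成 `eos_token_id` 或达到 `max_dec_len`。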
## 文本生成 下载预训练好的模型,快速体验文本生成 ```shell cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 mkdir -p ckpt wget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz tar -xzf ckpt/GPT_345M.tar.gz -C ckpt/ # --devices 根据并行策略设置设备 python -m paddle.distributed.launch --devices "0" generation/run.py \ -c generation/configs/generation_gpt_345M_dp8.yaml \ -o Global.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/ # 生成的文本,由于 checkpoint 不同,超参不同,随机数不同,您执行可能会生成不一样的内容 Prompt: Hi, GPT2. Tell me who Jack Ma is. Generation: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.” For now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba. Jack Ma on why he never wanted to run for President in 2016: There were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family. On how Alibaba will evolve into a new player in China’s transportation and logistics sector: I think that we are going to become a very important player in the logistics industry. So our strategy is to make it easy for people to travel. ``` ### 剖析体验文本生成 #### GPT 文本生成模块初始化 ```python module = build_module(cfg) module.model.eval() ``` #### 预训练模型加载 ```python # 获取到预训练 checkpoint 的根目录 ckpt_dir = cfg.Global.save_load.ckpt_dir # 构造出具体路径 model_path = os.path.join(ckpt_dir, "model.pdparams") # 加载模型参数 model_dict = paddle.load(model_path) # FP16 模型参数转成 FP32 模型参数 for key, value in model_dict.items(): model_dict[key] = model_dict[key].astype(paddle.float32) # 设置模型参数为预训练参数 module.model.set_state_dict(model_dict) ``` #### 文本生成与结果展示 ```python input_text = "Historical Records: Tell us about the history of the Great Wall." 
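# generate() 会按照 Generation 配置中的 top_k、top_p、temperature、max_dec_len 等参数进行采样解码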
result = module.generate(input_text) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') ``` ================================================ FILE: examples/transformer/models/GPT/docs/hybrid_profiler.md ================================================ # Profiler 本文档主要包括在 GPT 中开启 Profiler 并分析调试分析结果的方法,在模型开发中使用 Profiler 分析工具的方法请参考[教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)和[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/profiler/Profiler_cn.html)。 ## 参数配置 使用 Profiler 功能需要在任务配置文件中添加 Profiler 配置信息并确保字段为 `enable: True` 以开启分析器。 完整的可配置参数如下所示,可以根据使用场景调整配置。 ``` Profiler: enable: True scheduler: [1, 5] profiler_log: log_path detailed: True record_shapes: True profile_memory: True summary: overview: True device: True model: True dist: True kernel: True op: True mem: True memcpy: True ``` 其中参数说明: | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | enable | 是否开启 Profiler | False | | scheduler | 定义分析区间,如 [1, 5] 记录 step 1 到 step 4 的分析数据 | None | | profiler_log | 日志文件目录 | profiler_log | | detailed | 是否显示详细信息 | False | | record_shapes | 是否记录 tensor shape 相关信息 | True | | profile_memory | 是否统计 memory 相关信息 | True | 其中,当 detailed=True 时会打印所有 summary 表格数据,当 detailed=False 时用户可以根据以下说明定制需要展示的表格信息。 | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | summary.overview | 显示每种类型的 Event 时间消耗 | True | | summary.device | 显示 CPU 和 GPU 的平均利用率信息 | False | | summary.model | 显示模型 dataloader、forward、backward、optimization 时间消耗 | True | | summary.dist | 显示计算、通信以及重叠时间 | False | | summary.kernel | 显示 GPU 执行的 kernel 信息 | True | | summary.op | 显示框架中算子 (op) 的执行信息 | True | | summary.mem | 显示内存/显存占用统计信息 | False | | summary.memcpy | 显示框架中调用内存操作所花费的时间 | False | ## 运行分析 本节以 gpt混合并行 为例,首先进入目录, ``` cd PaddleFleetX/examples/transformer/models/GPT # 如果已在此目录下,则忽略 ``` 修改`pretrain/configs/pretrain_gpt_base.yaml` 中 Profiler.enable 为 True, 同时可以根据上节说明调整相关配置,或者使用命令行参数覆盖,例如可以使用以下命令运行程序, ``` python -m paddle.distributed.launch \ ./pretrain/run.py -c \ ./pretrain/configs/pretrain_gpt_1.3B_dp8.yaml -o Profiler.enable=True ``` > 在使用 Profiler 工具进行性能分析时,建议减少 train 的步数,获得分析数据即可停止训练。 ## 结果分析 在训练结束后会有以下数据: * 根据配置信息在控制台打印 summary 表格 * 在配置的 `profiler_log` 目录保存 profiler json 文件 这里保存的 json 文件可以通过如下两种方式查看: * 在 chrome 浏览器中打开 chrome://tracing/,然后打开 json 文件查看 * 根据控制台信息安装并启动 `visualdl --logdir log_path` 然后根据提示在浏览器中**性能分析**模块查看 具体的信息含义解释以及分析方法请参考[文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)。 > 在使用 visualdl 时,如果 log 文件数据较大,启动会比较耗时,请耐心等待。 ## 附录 控制台打印的 summary 信息示例如下所示。 **Overview Summary** ``` ---------------------------------------------Overview Summary--------------------------------------------- Time unit: ms ------------------------- ------------------------- ------------------------- ------------------------- Event Type Calls CPU Time Ratio (%) ------------------------- ------------------------- ------------------------- ------------------------- ProfileStep 4 18591.04 100.00 CudaRuntime 87527 8555.11 46.02 Operator 21912 1883.11 10.13 UserDefined 13116 1841.33 9.90 OperatorInner 33668 1018.39 5.48 Forward 8 731.46 3.93 Backward 4 671.82 3.61 Optimization 4 315.91 1.70 Dataloader 4 1.37 0.01 ------------------------- ------------------------- ------------------------- ------------------------- Calls GPU Time Ratio (%) ------------------------- 
------------------------- ------------------------- ------------------------- Kernel 16092 4924.90 26.49 Memcpy 4278 3617.26 19.46 Memset 780 2.31 0.01 Communication 192 2363.13 12.71 ------------------------- ------------------------- ------------------------- ------------------------- ``` **Model Summary** ``` -----------------------------------------------------Model Summary----------------------------------------------------- Time unit: ms --------------- ------ ----------------------------------------------- --------------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) --------------- ------ ----------------------------------------------- --------------------------------------------- ProfileStep 4 18591.04 / 4647.76 / 14114.47 / 757.27 / 100.00 4924.90 / 1231.22 / 2853.61 / 682.04 / 100.00 Dataloader 4 1.37 / 0.34 / 0.85 / 0.16 / 0.01 0.00 / 0.00 / 0.00 / 0.00 / 0.00 Forward 8 731.46 / 91.43 / 133.28 / 49.03 / 3.93 714.83 / 89.35 / 174.91 / 4.72 / 14.51 Backward 4 671.82 / 167.96 / 168.29 / 167.52 / 3.61 1701.53 / 425.38 / 426.97 / 424.10 / 34.55 Optimization 4 315.91 / 78.98 / 89.07 / 73.78 / 1.70 108.27 / 27.07 / 27.09 / 27.06 / 2.20 Others - 16870.48 / - / - / - / 90.75 2400.27 / - / - / - / 48.74 --------------- ------ ----------------------------------------------- --------------------------------------------- ``` **Operator Summary** ``` ----------------------------------------------------------------Operator Summary----------------------------------------------------------------- Time unit: ms ---------------------------------------------------- ------ ----------------------------------------- ---------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) ---------------------------------------------------- ------ ----------------------------------------- ---------------------------------------- -----------------------------------------------------------Thread: All threads merged------------------------------------------------------------ GradNodePyLayer_RecomputeFunction_backward 96 663.37 / 6.91 / 17.17 / 4.01 / 18.56 1629.87 / 16.98 / 17.41 / 16.69 / 26.98 TransformerDecoderLayer 96 262.68 / 2.74 / 5.91 / 1.90 / 39.60 661.18 / 6.89 / 7.11 / 6.73 / 40.57 backward 96 318.62 / 3.32 / 10.57 / 1.31 / 48.03 968.69 / 10.09 / 10.31 / 9.91 / 59.43 matmul dygraph 2312 200.13 / 0.09 / 1.61 / 0.04 / 5.60 1487.76 / 0.64 / 9.81 / 0.22 / 24.63 matmul infer_meta 964 1.42 / 0.00 / 0.01 / 0.00 / 0.71 0.00 / 0.00 / 0.00 / 0.00 / 0.00 matmul compute 964 71.38 / 0.07 / 1.59 / 0.03 / 35.67 644.02 / 0.67 / 9.81 / 0.22 / 43.29 MEMSET 192 - / - / - / - / - 0.42 / 0.00 / 0.00 / 0.00 / 0.07 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 - / - / - / - / - 199.35 / 0.52 / 0.83 / 0.22 / 30.95 volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn 384 - / - / - / - / - 263.96 / 0.69 / 0.79 / 0.59 / 40.99 volta_h884gemm_64x128_ldg8_nn 192 - / - / - / - / - 141.13 / 0.74 / 0.92 / 0.61 / 21.91 void cutlass::Kernel 580 209.08 / 0.36 / 0.97 / 0.06 / 4.25 volta_h884gemm_64x128_ldg8_nn 288 203.89 / 0.71 / 0.92 / 0.57 / 4.14 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 199.35 / 0.52 / 0.83 / 0.22 / 4.05 volta_h884gemm_256x64_ldg8_tn 288 149.52 / 0.52 / 0.54 / 0.45 / 3.04 void phi::funcs::VectorizedBroadcastKernel 192 122.37 / 0.64 / 0.66 / 0.60 / 2.48 void cutlass::Kernel 100 103.07 / 1.03 / 8.08 / 0.73 / 2.09 void phi::funcs::VectorizedElementwiseKernel 1: raise 
RuntimeError("Only support single-card finetune for GPT model.") env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.print_config(config) # build dataloader for training/eval dataset = cpn.build_dataset(config.Data.Train.dataset) sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset) train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset, sampler) dataset = cpn.build_dataset(config.Data.Eval.dataset) sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset) valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset, sampler) # build GPT model model, tokenizer, train_loss_fn, eval_loss_fn = impls.build_model(config) if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. model = paddle.amp.decorate(models=model, level='O2') else: scaler = None # build metric model_setting = copy.deepcopy(config.Model) metric_config = model_setting.pop("metric", None) assert metric_config is not None and 'eval' in metric_config if 'train' in metric_config: train_metric = copy.deepcopy(metric_config.train) train_metric_cls = train_metric.pop('name') train_metric = eval("metrics.{}".format(train_metric_cls))( **train_metric) eval_metric = copy.deepcopy(metric_config.eval) eval_metric_cls = eval_metric.pop('name') eval_metric = eval("metrics.{}".format(eval_metric_cls))(**eval_metric) best_metric = 0.0 # build lr and optim config.Optimizer.lr.update({ 'epochs': config.Global.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': config.Global.max_steps, }) if 'multi_precision' in config.Optimizer: assert config.Optimizer.pop('multi_precision') \ == config.Global.mix_precision.enable lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: model, optimizer, scaler = strategy.wrap_with_fleet( config.Distributed, model, optimizer, scaler) # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train', load_recovery) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start training assert config.Global.get('run_mode', 'epoch') == 'epoch', 'run_mode must be epoch' train_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(load_recovery['epoch'], config.Global.num_train_epochs): train_epoch_start = log.get_timestamp() # time count train_losses = [] train_step_start = log.get_timestamp() # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. 
total_train_batch = len(train_data_loader) total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 for step, batch in enumerate(train_data_loader): if epoch_index == load_recovery['epoch']: if step <= load_recovery['step']: continue model.train() fit_kwargs = { "model": model, "scaler": scaler, "optimizer": optimizer, "loss_fn": train_loss_fn, } def forward_func(batch, model, loss_fn): input_ids, labels = batch input_ids.stop_gradient = True labels.stop_gradient = True logits = model(input_ids) loss = loss_fn(logits, labels) return loss loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs) train_losses.append(loss) # training step log if (step + 1) % config.Global.logging_freq == 0: train_step_cost = log.get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] train_cost = train_step_cost \ if step == 0 else train_step_cost / config.Global.logging_freq speed = 1. / train_cost default_global_tokens_num = config.Global.global_batch_size * \ config.Data.Train.dataset.max_length ips_total = speed * default_global_tokens_num ips = ips_total / env.get_data_world_size() logger.info( "[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: " \ "%.5f sec, speed: %.2f step/s, ips_total: %.0f tokens/s, ips: %.0f tokens/s" % (epoch_index, config.Global.num_train_epochs, step, total_train_batch, optimizer.get_lr(), sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips)) train_step_start = log.get_timestamp() train_losses = [] if lr_scheduler is not None: lr_scheduler.step() optimizer.clear_grad() # save model/optim states in 'step' mode if step > 0 and config.Global.save_load.save_steps > 0 and \ step % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=step, epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) if profiler: profiler.step() # training epoch log train_epoch_cost = log.get_timestamp() - train_epoch_start logger.info("[Training] epoch: %d, total time: %.5f sec" % (epoch_index, train_epoch_cost)) eval_epoch_start = log.get_timestamp() # start eval in 'epoch' mode eval_step_start = log.get_timestamp() eval_losses = [] total_eval_batch = len(valid_data_loader) for eval_step, batch in enumerate(valid_data_loader): loss = impls.eval_impl(config, batch, model, eval_loss_fn, eval_metric) eval_losses.append(float(loss)) if eval_step % config.Global.logging_freq == 0: eval_step_cost = log.get_timestamp() - eval_step_start speed = 1. 
/ eval_step_cost logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (epoch_index, eval_step, sum(eval_losses) / len(eval_losses), eval_step_cost, speed)) eval_step_start = log.get_timestamp() eval_losses = [] eval_epoch_cost = log.get_timestamp() - eval_epoch_start # eval epoch log res = eval_metric.accumulate() eval_metric.reset() if isinstance(eval_metric, metrics.AccuracyAndF1): msg = "acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f" % ( res[0], res[1], res[2], res[3], res[4]) metric = res[4] elif isinstance(eval_metric, metrics.Mcc): msg = "mcc: %.5f" % (res[0]) metric = res[0] elif isinstance(eval_metric, metrics.PearsonAndSpearman): msg = "pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f" % ( res[0], res[1], res[2]) metric = res[2] else: msg = "acc: %.5f" % (res) metric = res if metric > best_metric: best_metric = metric logger.info( "[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f" % (epoch_index, eval_epoch_cost, msg, best_metric)) # save model/optim states in 'epoch' mode if config.Global.save_load.save_epoch > 0 and \ epoch_index % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=len(train_data_loader), epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) # training end log logger.info( "The training process is complete and total cost of time for training is : {}". format( log.convert_timestamp_to_data(log.get_timestamp() - train_start))) if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/models/GPT/finetune/run_task.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
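# 用法示例：sh run_task.sh <task>，其中 <task> 为 CoLA、SST2、MRPC、QQP、STSB、MNLI、QNLI、RTE、WNLI 之一；MNLI 可通过第二个参数指定评估集（默认 dev_matched）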
export CUDA_VISIBLE_DEVICES=0 # Single-Sentence Tasks if [ $1 == "CoLA" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=CoLA \ -o Data.Train.dataset.root=./dataset/cola_public/ \ -o Data.Eval.dataset.name=CoLA \ -o Data.Eval.dataset.root=./dataset/cola_public/ \ -o Data.Eval.dataset.split=dev \ -o Model.metric.train.name=Mcc \ -o Model.metric.eval.name=Mcc \ -o Model.num_classes=2 elif [ $1 == "SST2" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=SST2 \ -o Data.Train.dataset.root=./dataset/SST-2/ \ -o Data.Eval.dataset.name=SST2 \ -o Data.Eval.dataset.root=./dataset/SST-2/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 # Similarity and Paraphrase Tasks elif [ $1 == "MRPC" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Global.num_train_epochs=5 \ -o Data.Train.dataset.name=MRPC \ -o Data.Train.dataset.root=./dataset/MRPC/ \ -o Data.Eval.dataset.name=MRPC \ -o Data.Eval.dataset.root=./dataset/MRPC/ \ -o Data.Eval.dataset.split=test \ -o Model.num_classes=2 \ -o Model.metric.train.name=AccuracyAndF1 \ -o Model.metric.eval.name=AccuracyAndF1 elif [ $1 == "QQP" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=QQP \ -o Data.Train.dataset.root=./dataset/QQP/ \ -o Data.Eval.dataset.name=QQP \ -o Data.Eval.dataset.root=./dataset/QQP/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 \ -o Model.metric.train.name=AccuracyAndF1 \ -o Model.metric.eval.name=AccuracyAndF1 elif [ $1 == "STSB" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=STSB \ -o Data.Train.dataset.root=./dataset/STS-B/ \ -o Data.Eval.dataset.name=STSB \ -o Data.Eval.dataset.root=./dataset/STS-B/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=1 \ -o Model.metric.train.name=PearsonAndSpearman \ -o Model.metric.eval.name=PearsonAndSpearman \ -o Model.loss.train.name=MSELoss \ -o Model.loss.eval.name=MSELoss # Inference Tasks elif [ $1 == "MNLI" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=MNLI \ -o Data.Train.dataset.root=./dataset/multinli_1.0 \ -o Data.Eval.dataset.name=MNLI \ -o Data.Eval.dataset.root=./dataset/multinli_1.0 \ -o Data.Eval.dataset.split=${2:-"dev_matched"} \ -o Model.num_classes=3 elif [ $1 == "QNLI" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=QNLI \ -o Data.Train.dataset.root=./dataset/QNLI/ \ -o Data.Eval.dataset.name=QNLI \ -o Data.Eval.dataset.root=./dataset/QNLI/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 elif [ $1 == "RTE" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Data.Train.dataset.name=RTE \ -o Data.Train.dataset.root=./dataset/RTE/ \ -o Data.Eval.dataset.name=RTE \ -o Data.Eval.dataset.root=./dataset/RTE/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 elif [ $1 == "WNLI" ] then python finetune/run.py -c ./finetune/configs/finetune_gpt_345M_single_card_glue.yaml \ -o Global.num_train_epochs=5 \ -o Data.Train.dataset.name=WNLI \ -o Data.Train.dataset.root=./dataset/WNLI/ \ -o Data.Eval.dataset.name=WNLI \ -o Data.Eval.dataset.root=./dataset/WNLI/ \ -o Data.Eval.dataset.split=dev \ -o Model.num_classes=2 else echo "Task 
name not recognized, please input CoLA, SST2, MRPC, QQP, STSB, MNLI, QNLI, RTE, WNLI." fi ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_gpt_345M_dp8.yaml ================================================ _base_: ./generation_gpt_base.yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_base.yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 8 micro_batch_size: 8 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: fused_linear: False fuse_attn_qkv: True sequence_parallel: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_pruned_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_base.yaml Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_qat_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_base.yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/generation/configs/generation_qat_gpt_6.7B_single_card.yaml ================================================ 
_base_: ./generation_gpt_base.yaml Model: vocab_size: 50304 hidden_size: 1024 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: 16384 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/generation/configs/inference_gpt_345M_dp8.yaml ================================================ _base_: ./generation_gpt_345M_dp8.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/configs/inference_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_345M_single_card.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/generation/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
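# 将训练好的 GPT 模型导出为静态图推理模型（配置 Compress.Quantization 时导出量化模型），导出结果保存在 Global.save_load.output_dir 下的 rank_{dp_rank} 目录中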
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.utils.export import export_inference_model from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) if config.Global.mix_precision.enable: logger.info("NOTE: disable mix_precision in export mode") # build GPT model model, _ = impls.build_model(config) # export model.eval() input_spec = [ InputSpec( shape=[None, None], name="input_ids", dtype='int64') ] output_dir = config.Global.save_load.output_dir dp_rank = 0 if nranks == 1 else env.get_hcg().get_data_parallel_rank() save_dir = os.path.join(output_dir, "rank_{}".format(dp_rank)) quanter = None quant_mode = False if 'Compress' in config: mode = 'compress' compress_configs = config['Compress'] if "Quantization" in compress_configs: quant_mode = True model, quanter = qat.compress_model(config, model, input_spec) # load pretrained checkpoints if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='export', load_recovery=None) if not quant_mode: export_inference_model(model, input_spec, save_dir, 'model') else: logger.info("export quantized model.") export_inference_model( model, input_spec, save_dir, 'model', export_quant_model=True, quanter=quanter) ================================================ FILE: examples/transformer/models/GPT/generation/impls.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
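# GPT 文本生成示例的辅助实现：构建生成模型与 tokenizer（仅支持单卡或数据并行），并提供输入序列的 left padding 工具函数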
import os import sys import copy import numpy as np import paddle import paddle.distributed as dist from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env import ppfleetx.models.language_model.gpt as gpt from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer from examples.transformer.models.GPT.pretrain.impls import fit_impl as pretrain_fit_impl MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def adjust_length_to_model(length, max_sequence_length): if length < 0 or length > max_sequence_length: length = max_sequence_length return length def build_model(config): nranks = dist.get_world_size() generation_cfgs = config.Generation model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) if nranks == 1: model = gpt.GPTForGeneration( gpt.GPTModel(**model_setting), generation_cfgs) else: assert nranks == config.Distributed.dp_degree, \ "only support single card and data parallel in generation task." model = gpt.GPTForGenerationHybrid( gpt.GPTModelHybrid(**model_setting), generation_cfgs) generation_cfgs['max_dec_len'] = adjust_length_to_model( generation_cfgs['max_dec_len'], 512) generation_cfgs['bos_token_id'] = tokenizer.eos_token_id generation_cfgs['eos_token_id'] = tokenizer.eos_token_id generation_cfgs['pad_token_id'] = tokenizer.eos_token_id return model, tokenizer def left_padding(inputs, pad_id, padding="longest"): assert "input_ids" in inputs, "input_ids should be in inputs!" max_length = 0 for ids in inputs["input_ids"]: max_length = max(max_length, len(ids)) def extend_max_lenth(value, max_length, to_pad_id): return [to_pad_id] * (max_length - len(value)) + value def extend_filed(name, max_length, to_pad_id): values = inputs[name] res = [] for index, value in enumerate(values): res.append(extend_max_lenth(value, max_length, to_pad_id)) inputs[name] = res extend_filed("input_ids", max_length, pad_id) if "attention_mask" in inputs: extend_filed("attention_mask", max_length, 0) if "position_ids" in inputs: extend_filed("position_ids", max_length, 0) return inputs ================================================ FILE: examples/transformer/models/GPT/generation/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
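# 使用已导出的推理模型，通过 InferenceEngine（可选 TensorRT 配置）对示例 prompt 进行文本生成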
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.core.engine import InferenceEngine, TensorRTConfig from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) # build model model, tokenizer = impls.build_model(config) model.eval() if 'Inference' in config: inference_configs = config['Inference'] inference_engine = None else: raise RuntimeError(f'No Inference in config') input_text = 'Hi, GPT2. Tell me who Jack Ma is.' input_ids = [tokenizer.encode(input_text)] if inference_engine is None: # parse TensorRT config tensorrt_config = None if 'TensorRT' in inference_configs: tensorrt_config = TensorRTConfig(**inference_configs['TensorRT']) inference_engine = InferenceEngine(inference_configs['model_dir'], inference_configs['mp_degree'], tensorrt_config) outs = inference_engine.predict([input_ids]) ids = list(outs.values())[0] out_ids = [int(x) for x in ids[0]] result = tokenizer.decode(out_ids) result = input_text + result print('Prompt:', input_text) print('Generation:', result) ================================================ FILE: examples/transformer/models/GPT/generation/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
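# 动态图文本生成入口：构建模型、加载预训练参数后对示例 prompt 进行生成，支持量化压缩与多卡（fleet）运行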
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) # build model model, tokenizer = impls.build_model(config) if 'Compress' in config: input_spec = [ InputSpec( shape=[None, None], name="input_ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) model.eval() cfg.print_config(config) # call fleet wrapper if nranks > 1: model, _, _ = strategy.wrap_with_fleet( config.Distributed, model, optimizer=None, scaler=None) # load pretrained checkpoints if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='generation', load_recovery=None) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None input_text = 'Hi, GPT2. Tell me who Jack Ma is.' input_ids = tokenizer.encode(input_text) inputs = {'input_ids': [input_ids]} inputs = impls.left_padding(inputs, tokenizer.eos_token_id) input_ids = inputs['input_ids'] if len(input_ids) == 0: input_ids = None else: # [1, seq_len] input_ids = paddle.to_tensor(input_ids, dtype='int64') ids, scores = model(input_ids=input_ids) result = [] for i, generated_ids in enumerate(ids): generated_ids = generated_ids.numpy().tolist() # Decode text text = tokenizer.convert_ids_to_string(generated_ids) sequence = input_text + text result.append(sequence) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_gpt_345M_single_card.yaml ================================================ _base_: ./eval_gpt_base.yaml Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 8 micro_batch_size: 8 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 
max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: fused_linear: False fuse_attn_qkv: True sequence_parallel: False Data: Eval: dataset: name: LM_Eval_Dataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 overlapping_eval: sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_pruned_gpt_345M_single_card.yaml ================================================ _base_: ./eval_gpt_base.yaml Model: hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Offline_Eval: eval_path: ./lambada_test.jsonl cloze_eval: True overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/configs/eval_qat_gpt_345M_single_card.yaml ================================================ _base_: ./eval_gpt_base.yaml Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: examples/transformer/models/GPT/offline-eval/impls.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
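"""Building blocks for offline evaluation of GPT checkpoints.

Two datasets are provided: ``LM_Eval_Dataset`` slides an (optionally
overlapping) window over a tokenized corpus such as WikiText-103 for
perplexity-style evaluation, while ``Lambada_Eval_Dataset`` appends the
target word to each LAMBADA context and builds a loss mask so that only
the final word is scored. ``eval_impl`` runs one batch under AMP and
returns the summed masked cross-entropy loss in the perplexity case, or
the number of fully matched targets in the cloze case; ``run.py``
accumulates these values into perplexity or accuracy.
"""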
import os import sys import copy import numpy as np import json import re import math import paddle import paddle.distributed as dist from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env from ppfleetx.models.language_model import gpt from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def build_model(config): nranks = dist.get_world_size() model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map', {}) model_setting['freeze_embedding'] = quant_setting.get( 'freeze_embedding', False) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) if nranks == 1: model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: raise RuntimeError( "Only single-card offline eval is supported in GPTModel now.") return model, tokenizer @paddle.no_grad() def eval_impl(config, batch, model): model.eval() use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): tokens, loss_mask, attention_mask, position_ids, labels = batch preds = model(tokens, position_ids, attention_mask) if not config.Offline_Eval.cloze_eval: masked_lm_loss = paddle.nn.functional.cross_entropy( preds, labels, reduction="none") loss = paddle.sum(masked_lm_loss * loss_mask) return loss else: outputs = paddle.argmax(preds, -1) acc = paddle.cast(outputs == labels, 'float32') acc = paddle.where( paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc)) acc = paddle.sum(paddle.prod(acc, -1)) return acc class LM_Eval_Dataset(paddle.io.Dataset): def __init__(self, tokens, max_seq_len, eos_token_id, overlapping_eval=None, **kwargs): self.tokens = tokens self.seq_len = max_seq_len self.pad_idx = eos_token_id self.overlapping_eval = overlapping_eval if self.overlapping_eval is None: self.overlapping_eval = self.seq_len self.overlapping_eval = max(1, self.overlapping_eval) self.total_targets = len(self.tokens) - 1 # remove first sequence tokens targets = max(self.total_targets - self.overlapping_eval, 0) self.total_sequences = max( math.ceil(targets / self.overlapping_eval) + 1, 1) def __len__(self): return self.total_sequences def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss loss_mask = np.ones(seq_length, dtype="float32") loss_mask[np.where(np.array(tokens) == self.pad_idx)] = 0.0 position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default # attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, loss_mask, attention_mask, position_ids, labels] def __getitem__(self, idx): start_idx = idx * self.overlapping_eval end_idx = start_idx + self.seq_len tokens = self.tokens[start_idx:end_idx + 1] num_tokens = len(tokens) if 
num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad [tokens, loss_mask, attention_mask, position_ids, labels] = self._construct_sample(tokens) if self.overlapping_eval != self.seq_len and idx != 0: loss_mask[:-self.overlapping_eval] *= 0 return [tokens, loss_mask, attention_mask, position_ids, labels] class Lambada_Eval_Dataset(paddle.io.Dataset): def __init__(self, tokens, labels, max_seq_len, eos_token_id, **kwargs): self.pad_idx = eos_token_id self.seq_len = max_seq_len self.tokens = tokens self.labels = labels def __len__(self): return len(self.tokens) def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default #attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, attention_mask, position_ids, labels] def __getitem__(self, idx): tokens = self.tokens[idx][:self.seq_len] labels = self.labels[idx] tokens = tokens + labels num_tokens = len(tokens) if num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad loss_mask = np.zeros(self.seq_len, dtype="float32") loss_mask[num_tokens - len(labels) - 1:num_tokens - 1] = 1. [tokens, attention_mask, position_ids, labels] = self._construct_sample(tokens) return [tokens, loss_mask, attention_mask, position_ids, labels] def wikitext_detokenizer(string): # contractions string = string.replace("s '", "s'") string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) # number separators string = string.replace(" @-@ ", "-") string = string.replace(" @,@ ", ",") string = string.replace(" @.@ ", ".") # punctuation string = string.replace(" : ", ": ") string = string.replace(" ; ", "; ") string = string.replace(" . ", ". ") string = string.replace(" ! ", "! ") string = string.replace(" ? ", "? ") string = string.replace(" , ", ", ") # double brackets string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) # miscellaneous string = string.replace("= = = =", "====") string = string.replace("= = =", "===") string = string.replace("= =", "==") string = string.replace(" " + chr(176) + " ", chr(176)) string = string.replace(" \n", "\n") string = string.replace("\n ", "\n") string = string.replace(" N ", " 1 ") string = string.replace(" 's", "'s") return string def get_tokens(tokenizer, text, strict=True): if not strict: tokens = tokenizer.encode(text) return tokens[:-1], [tokens[-1]] last_token = text.split()[-1] start_idx = text.rfind(last_token) beginning_tokens = tokenizer.encode(text[:start_idx].strip()) last_token = tokenizer.encode(' ' + last_token) return beginning_tokens, last_token ================================================ FILE: examples/transformer/models/GPT/offline-eval/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import json import math import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.models.language_model import gpt from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) # process configs eval_cfgs = config.Offline_Eval config.Data.Eval.pop("sampler", None) config.Data.Eval.loader.collate_fn = "gpt_collate_fn" config.Data.Eval.loader.batch_size = eval_cfgs.batch_size config.Data.Eval.dataset.input_dir = eval_cfgs.eval_path config.Data.Eval.dataset.max_seq_len = eval_cfgs.max_seq_len config.Global.logging_freq = eval_cfgs.logging_freq if not eval_cfgs.cloze_eval: config.Data.Eval.dataset.name = "LM_Eval_Dataset" config.Data.Eval.dataset.overlapping_eval = eval_cfgs.overlapping_eval else: config.Data.Eval.dataset.name = "Lambada_Eval_Dataset" cfg.print_config(config) # build GPT model model, tokenizer = impls.build_model(config) if 'Compress' in config: input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
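        # paddle.amp.decorate(..., level='O2') casts the model parameters to
        # float16 so that the evaluation below runs in pure fp16.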
model = paddle.amp.decorate(models=model, level='O2') else: scaler = None # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='eval', load_recovery=load_recovery) # build dataset for eval if not eval_cfgs.cloze_eval: with open(eval_cfgs.eval_path, "rb") as reader: entire_data = reader.read().decode('utf-8') num_original_tokens = len(entire_data.strip().split(" ")) entire_data = impls.wikitext_detokenizer(entire_data) tokenized_data = tokenizer.encode(entire_data) num_tokenized_tokens = len(tokenized_data) print('Original Tokens: %d, Detokenized tokens: %d' % (num_original_tokens, num_tokenized_tokens)) dataset = impls.LM_Eval_Dataset( tokens=tokenized_data, max_seq_len=eval_cfgs.max_seq_len, overlapping_eval=eval_cfgs.overlapping_eval, eos_token_id=tokenizer.eos_token_id) else: tokenized_data = [] tokenized_label = [] with open(eval_cfgs.eval_path, 'r') as f: for line in f.readlines(): text = json.loads(line)['text'] tokens, labels = impls.get_tokens(tokenizer, text) tokenized_data.append(tokens) tokenized_label.append(labels) dataset = impls.Lambada_Eval_Dataset( tokens=tokenized_data, labels=tokenized_label, max_seq_len=eval_cfgs.max_seq_len, eos_token_id=tokenizer.eos_token_id) num_examples = len(dataset) # build dataloader for eval valid_data_loader = cpn.build_dataloader( config.Data.Eval.loader, dataset, batch_sampler=None) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start eval model.eval() total_score = 0 score_name = "loss" if not eval_cfgs.cloze_eval else "number correct" eval_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(config.Global.num_train_epochs): eval_epoch_start = log.get_timestamp() eval_step_start = log.get_timestamp() eval_losses = [] total_eval_batch = len(valid_data_loader) for eval_step, batch in enumerate(valid_data_loader): loss = impls.eval_impl(config, batch, model) eval_losses.append(float(loss)) if eval_step > 0 and eval_step % config.Global.logging_freq == 0: eval_step_cost = log.get_timestamp() - eval_step_start speed = config.Global.logging_freq / eval_step_cost eval_loss = sum(eval_losses) / len(eval_losses) if not eval_cfgs.cloze_eval: total_score += eval_loss * config.Global.logging_freq / ( num_tokenized_tokens - 1) else: total_score += eval_loss * config.Global.logging_freq logger.info( "[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s" % (epoch_index, eval_step, score_name, total_score, speed)) eval_step_start = log.get_timestamp() eval_losses = [] if eval_step >= config.Global.max_steps: break eval_epoch_cost = log.get_timestamp() - eval_epoch_start logger.info( "[eval] epoch {} : evaluting process is complete and cost {}". 
format(epoch_index, log.convert_timestamp_to_data( eval_epoch_cost))) string = '[eval] epoch {} : validation results on {} | '.format( epoch_index, eval_cfgs.eval_path) if not eval_cfgs.cloze_eval: total_loss = float(total_score) ppl = math.exp(min(20, total_loss)) token_ratio = (num_tokenized_tokens - 1) / ( num_original_tokens - 1) adjusted_ppl = math.exp(min(20, total_loss * token_ratio)) string += 'avg loss: {:.4E} | '.format(total_loss) string += 'ppl: {:.4E} | '.format(ppl) string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) string += 'token ratio: {} |'.format(token_ratio) else: num_correct = float(total_score) acc = float(num_correct / num_examples) string += 'number correct: {:.4E} | '.format(num_correct) string += 'total examples: {:.4E} | '.format(num_examples) string += 'avg accuracy: {:.4E}'.format(acc) logger.info(string) # evaluting end log logger.info( "The evaluting process is complete and total cost of time for evaluting is : {}". format( log.convert_timestamp_to_data(log.get_timestamp() - eval_start))) del valid_data_loader if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/export_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_1.3B_dp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 8 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_1.3B_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 
sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_175B_mp8_pp16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 1536 micro_batch_size: 1 Model: vocab_size: 51200 hidden_size: 12288 num_layers: 96 num_attention_heads: 96 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: 'core_attn' no_recompute_layers: virtual_pp_degree: 1 sequence_parallel: True fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 16 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: name: "GPT" fused_linear: False fuse_attn_qkv: True scale_qk_by_layer_num: True sequence_parallel: False no_recompute_layers: vocab_size_divisible_unit: 128 fused_softmax_with_triangular: True Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 
return_list: False collate_fn: gpt_collate_fn Eval: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 use_increments: True grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Distributed: fuse_sequence_parallel_allreduce: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_cn_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: name: "GPT-cn" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/prune_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False comm_overlap: False Optimizer: weight_decay: 0.0 lr: decay_steps: 90000 warmup_rate: 0.00 max_lr: 2.5e-5 min_lr: 5.0e-6 Compress: pretrained: Prune: enable: True criterion: l1_norm ratio: 0.125 ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/qat_gpt_345M_mp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 1 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: 
['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: examples/transformer/models/GPT/pretrain/configs/qat_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: examples/transformer/models/GPT/pretrain/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
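"""Export a trained GPT model to a static inference graph.

The script rebuilds the dygraph model from the training config, restores
the checkpoint named by ``Global.save_load.ckpt_dir`` in ``'export'``
mode, and calls ``export_inference_model`` with ``tokens``/``ids`` input
specs of shape ``[None, None]``. When the config contains a
``Compress.Quantization`` section, the model is first wrapped by
``qat.compress_model`` and exported through the quantized path
(``export_quant_model=True``). The exported files are written to
``<output_dir>/rank_<dp_rank>``.
"""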
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from ppfleetx.utils.export import export_inference_model from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) if config.Global.mix_precision.enable: logger.info("NOTE: disable mix_precision in export mode") # build GPT model model, _, _ = impls.build_model(config) # export model.eval() input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] output_dir = config.Global.save_load.output_dir dp_rank = 0 if nranks == 1 else env.get_hcg().get_data_parallel_rank() save_dir = os.path.join(output_dir, "rank_{}".format(dp_rank)) quanter = None quant_mode = False if 'Compress' in config: mode = 'compress' compress_configs = config['Compress'] if "Quantization" in compress_configs: quant_mode = True model, quanter = qat.compress_model(config, model, input_spec) # load pretrained checkpoints if config.Global.save_load.ckpt_dir is not None: io.load( config.Global.save_load.ckpt_dir, model, optimizer=None, mode='export', load_recovery=None) if not quant_mode: export_inference_model(model, input_spec, save_dir, 'model') else: logger.info("export quantized model.") export_inference_model( model, input_spec, save_dir, 'model', export_quant_model=True, quanter=quanter) ================================================ FILE: examples/transformer/models/GPT/pretrain/impls.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
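"""Model construction and train/eval step implementations for GPT pretraining.

``build_model`` pads the vocabulary to a multiple of
``vocab_size_divisible_unit * mp_degree`` (for example, the raw GPT-2
vocabulary of 50257 tokens becomes 50304 with the default unit of 128 and
``mp_degree`` 1), logs an estimate of the parameter count, and picks the
single-card, hybrid-parallel or pipeline variant of the model based on
the world size and the ``Distributed`` settings.
``model_forward_backward`` implements gradient accumulation by splitting
each batch into ``accumulate_steps`` micro batches (or delegating to the
pipeline engine), and ``optim_update_params`` performs the data-parallel
gradient reduction followed by the (scaled) optimizer step; ``fit_impl``
ties the two together for one training step.
"""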
import os import sys import copy import paddle import paddle.distributed as dist from paddle.optimizer.lr import LRScheduler from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env, amp import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def _get_model_size(l, h, v, s): P = 0 # embedding P += (v + s) * h # attention P += (4 * h * h + 4 * h) * l # layer_norm of decoder P += (2 * (2 * h)) * l # FFN Layer P += (8 * h * h + 5 * h) * l # layer_norm of transformer P += 2 * h logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def _vocab_size_with_padding(vocab_size, div_unit, mp_degree): padded_size = vocab_size multiple = div_unit * mp_degree while (padded_size % multiple) != 0: padded_size += 1 logger.warning(' > padded vocab (size: {}) with {} dummy tokens ' '(new size: {})'.format(vocab_size, padded_size - vocab_size, padded_size)) return padded_size def build_model(config): nranks = dist.get_world_size() model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map', {}) model_setting['freeze_embedding'] = quant_setting.get( 'freeze_embedding', False) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = _vocab_size_with_padding( model_setting.get('vocab_size', tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), config.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = config.Data.Train.dataset.max_seq_len _get_model_size(l, h, v, s) if nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting['num_partitions'] = config.Distributed.mp_degree if config.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) if config.Model.sequence_parallel: register_sequence_parallel_allreduce_hooks( model, config.Global.accumulate_steps, config.Distributed.fuse_sequence_parallel_allreduce) if nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird( sequence_parallel=config.Model.sequence_parallel) return model, tokenizer, loss_fn def model_forward_backward(config, batch, forward_func, **kwargs): acc_steps = config.Global.accumulate_steps amp_enable = config.Global.mix_precision.enable amp_dtype = config.Global.mix_precision.dtype amp_level = config.Global.mix_precision.level black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list # train with pipeline 
strategy if config.Distributed.pp_degree > 1: tokens, position_ids, labels, loss_mask = batch batch = [(tokens, position_ids), (labels, loss_mask)] batches = [batch] with paddle.amp.auto_cast( amp_enable, custom_black_list=black_list, custom_white_list=white_list, dtype=amp_dtype, level=amp_level): batch = kwargs['model']._prepare_training( batch, kwargs['optimizer'], None) loss = kwargs['model'].forward_backward_pipeline(batch, kwargs['scaler']) return loss # train with non-pipeline strategy if acc_steps == 1: batches = [batch] else: split_batches = [paddle.split(b, acc_steps) for b in batch] batches = [] for i in range(len(split_batches[0])): micro_batch = [split_batch[i] for split_batch in split_batches] batches.append(micro_batch) # gradient merge strategy final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( amp_enable, custom_black_list=black_list, custom_white_list=white_list, dtype=amp_dtype, level=amp_level): # forward in training step loss = forward_func(micro_batch, kwargs['model'], kwargs['loss_fn']) loss_bw = kwargs['scaler'].scale( loss) if amp_enable and amp_dtype == "float16" else loss loss_bw = loss_bw / acc_steps if acc_steps > 1 else loss_bw loss_bw.backward() detach_loss = loss.detach() if final_loss is None: final_loss = detach_loss else: final_loss = paddle.add(final_loss, detach_loss) final_loss = final_loss / acc_steps if acc_steps > 1 else final_loss return final_loss def optim_update_params(config, **kwargs): hcg = env.get_hcg() amp_enable = config.Global.mix_precision.enable amp_dtype = config.Global.mix_precision.dtype dp_degree = config.Distributed.dp_degree sharding_stage = config.Distributed.sharding.sharding_stage if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): if not hasattr(kwargs['optimizer'], "all_fused_tensors") or kwargs[ 'optimizer'].all_fused_tensors is None: fused_allreduce_gradients(list(kwargs['model'].parameters()), None) else: dp_group = hcg.get_data_parallel_group() all_reduce_parameters(kwargs['optimizer'].all_fused_tensors, dp_group) elif isinstance(kwargs['model'], amp.MixPrecisionLayer) \ and dist.get_world_size() > 1 and dist.get_world_size() == dp_degree: fused_allreduce_gradients(list(kwargs['model'].parameters()), None) if sharding_stage == 3 and dp_degree > 1: dp_group = hcg.get_data_parallel_group() fused_allreduce_gradients(kwargs['model'].parameters(), hcg) for p in kwargs['model'].parameters(): if hasattr(p, "bw_storage"): assert p.grad is None, "This case shouldn't happen." 
p.bw_storage.scale_(1.0 / dp_group.nranks) dist.all_reduce(p.bw_storage, group=dp_group) if amp_enable and amp_dtype == 'float16': kwargs['scaler'].step(kwargs['optimizer']) kwargs['scaler'].update() else: kwargs['optimizer'].step() def fit_impl(config, batch, forward_func, **kwargs): kwargs['model'].train() if config.Distributed.pp_degree == 1: if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): with kwargs['model'].no_sync(): loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) optim_update_params(config, **kwargs) return loss @paddle.no_grad() def eval_impl(config, batch, model, loss_fn): model.eval() amp_enable = config.Global.mix_precision.enable amp_dtype = config.Global.mix_precision.dtype amp_level = config.Global.mix_precision.level black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list with paddle.amp.auto_cast( amp_enable, custom_black_list=black_list, custom_white_list=white_list, dtype=amp_dtype, level=amp_level): tokens, position_ids, labels, loss_mask = batch if config.Distributed.pp_degree == 1: tokens, position_ids, labels, loss_mask = batch preds = model(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = loss_fn(preds, labels, loss_mask) else: batch = [(tokens, position_ids), (labels, loss_mask)] loss = model.eval_batch(batch, compute_loss=True) return loss ================================================ FILE: examples/transformer/models/GPT/pretrain/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
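"""Training entry point for the GPT pretraining example.

The script builds the train/eval ``GPTDataset`` loaders, the model, loss
and tokenizer (``impls.build_model``), the learning-rate schedule and the
optimizer named in the config, and sets up fp16/bf16 AMP, optionally with
main-grad via the ``MixPrecision*`` wrappers. For multi-rank runs the
model, optimizer and scaler are wrapped with fleet before an optional
checkpoint is restored. The main loop then calls ``impls.fit_impl`` per
batch, logs loss, step time and throughput in tokens/s, evaluates with
``impls.eval_impl`` every ``eval_freq`` steps, and saves checkpoints with
``io.save`` every ``save_steps`` steps.
"""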
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io, amp from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from examples.transformer.utils import qat from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=False) paddle.set_device(config.Global.device) # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) # Note: Only for GPTDataset dataset_kwargs = { "seed": config.Global.seed, "model_type": config.Model.name, } sampler_kwargs = {"batch_size": config.Global.local_batch_size, } # build dataloader for training/eval dataset_kwargs.update({"mode": "Train"}) dataset = cpn.build_dataset(config.Data.Train.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset, **sampler_kwargs) train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset, sampler) dataset_kwargs.update({"mode": "Eval"}) dataset = cpn.build_dataset(config.Data.Eval.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset, **sampler_kwargs) valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset, sampler) # build GPT model model, tokenizer, loss_fn = impls.build_model(config) if 'Compress' in config: input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) amp_config = config.Global.mix_precision amp_enable = amp_config['enable'] amp_dtype = amp_config.get('dtype', 'float16') amp_level = amp_config.get('level', 'O2') amp_use_main_grad = amp_config.get('use_main_grad', False) amp_scale_loss = amp_config.get('scale_loss', 32768) if amp_enable: if amp_dtype == "float16": scaler = paddle.amp.GradScaler(init_loss_scaling=amp_scale_loss) elif amp_dtype == "bfloat16": scaler = paddle.amp.GradScaler( init_loss_scaling=1, use_dynamic_loss_scaling=False) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
model = paddle.amp.decorate( models=model, level=amp_level, dtype=amp_dtype) else: scaler = None config.Optimizer.lr.update({ 'epochs': config.Global.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': config.Global.max_steps, }) use_increments = config.Optimizer.lr.pop('use_increments', False) # build lr and optim lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, multi_precision=config.Global.mix_precision.enable) if amp_enable and amp_dtype in [ 'float16', 'bfloat16' ] and amp_level == 'O2' and amp_use_main_grad: model = amp.MixPrecisionLayer(model, dtype=amp_dtype) optimizer = amp.MixPrecisionOptimizer(optimizer) scaler = amp.MixPrecisionScaler(scaler) # call fleet wrapper if nranks > 1: model, optimizer, scaler = strategy.wrap_with_fleet( config.Distributed, model, optimizer, scaler) # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train', load_recovery) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start training train_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(load_recovery['epoch'], config.Global.num_train_epochs): train_epoch_start = log.get_timestamp() # time count train_losses = [] train_step_start = log.get_timestamp() # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. total_train_batch = len(train_data_loader) total_train_step = config.Global.max_steps total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 valid_data_loader = valid_data_loader( ) if valid_data_loader is not None else None eval_finished_step = 0 for step, batch in enumerate(train_data_loader()): if epoch_index == load_recovery['epoch']: if step < load_recovery['step']: continue model.train() fit_kwargs = { "model": model, "loss_fn": loss_fn, "scaler": scaler, "optimizer": optimizer, } def forward_func(batch, model, loss_fn): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = model(tokens, position_ids) loss = loss_fn(preds, labels, loss_mask) return loss loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs) train_losses.append(loss) if lr_scheduler is not None: if scaler is None or scaler._found_inf == 0: lr_scheduler.step(epoch=config.Global.global_batch_size if use_increments else None) # training step log if (step + 1) % config.Global.logging_freq == 0: train_step_cost = log.get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] train_cost = train_step_cost \ if step == 0 else train_step_cost / config.Global.logging_freq speed = 1. 
/ train_cost default_global_tokens_num = config.Global.global_batch_size * \ config.Data.Train.dataset.max_seq_len ips_total = speed * default_global_tokens_num ips = ips_total / env.get_data_world_size() loss_scale_str = " loss_scale: %.9f," % ( scaler._scale.numpy()[0]) if scaler is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s,%s learning rate: %.5e, found_inf: %d" % (epoch_index, config.Global.num_train_epochs, step, total_train_step, sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips, loss_scale_str, optimizer.get_lr(), scaler._found_inf if scaler is not None else 0)) train_step_start = log.get_timestamp() train_losses = [] optimizer.clear_grad() # start eval if step > 0 and config.Global.eval_freq > 0 and step % config.Global.eval_freq == 0: eval_losses = [] eval_step_start = log.get_timestamp() for eval_step, batch in enumerate(valid_data_loader): eval_finished_step += 1 loss = impls.eval_impl(config, batch, model, loss_fn) eval_losses.append(loss) if eval_step >= config.Global.eval_iters - 1: break eval_step_cost = log.get_timestamp() - eval_step_start eval_loss = sum(eval_losses) / len(eval_losses) eval_cost = eval_step_cost / config.Global.logging_freq logger.info( "[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (epoch_index, eval_step, eval_finished_step, float(eval_loss), eval_cost, 1. / eval_cost)) if step > 0 and config.Global.save_load.save_steps > 0 and \ step % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=step, epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) if step >= config.Global.max_steps: break if profiler: profiler.step() # training epoch log train_epoch_cost = log.get_timestamp() - train_epoch_start logger.info("[Training] epoch: %d, total time: %.5f sec" % (epoch_index, train_epoch_cost)) # training end log logger.info( "The training process is complete and total cost of time for training is : {}". 
        format(
            log.convert_timestamp_to_data(log.get_timestamp() - train_start)))

    if profiler:
        cpn.profiler_done(profiler, config.Profiler)


================================================
FILE: examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml
================================================
_base_: ./pretrain_moe_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 2
  max_steps: 20000
  logging_freq: 10
  mix_precision:
    enable: True

Data:
  Train:
    dataset:
      split: [98,2,0]
    loader:
      num_workers: 0
  Eval:
    dataset:
      split: [98,2,0]

Model:
  vocab_size: 50304
  hidden_size: 768
  num_layers: 12
  num_attention_heads: 12
  ffn_hidden_size: 3072
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.014
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:
  # one entry per group of MoE layers; a single value is broadcast to all
  # MoE layers (see _get_model_size in pretrain_moe/impls.py)
  num_experts: [2]
  expert_interval: 2
  topk: 1
  moe_use_residual: False #True
  moe_train_capacity_factor: 1.0
  moe_eval_capacity_factor: 1.0
  moe_min_capacity: 4
  moe_token_dropping: True
  balance_loss_weight: 0.01
  enable_expert_tensor_parallelism: False

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
    reduce_overlap: False
    broadcast_overlap: False


================================================
FILE: examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml
================================================
Global:
  device: gpu
  seed: 1234

  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

  max_steps: 500000
  num_train_epochs: 1
  accumulate_steps:
  logging_freq: 1
  eval_freq: 1000
  eval_iters: 10
  test_iters:
  mix_precision:
    enable: True
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]

  save_load:
    save_steps: 1000
    save_epoch: 1
    output_dir: ./output
    ckpt_dir:

Model:
  name: "GPT"
  fused_linear: False
  fuse_attn_qkv: True
  sequence_parallel: False
  no_recompute_layers:

Data:
  Train:
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
      collate_fn: gpt_collate_fn

  Eval:
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
      collate_fn: gpt_collate_fn

Optimizer:
  name: FusedAdamW
  weight_decay: 0.1
  beta1: 0.9
  beta2: 0.95
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 20000
    warmup_rate: 0.01
    max_lr: 4.5e-4
    min_lr: 4.5e-6
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False

Distributed:
  fuse_sequence_parallel_allreduce: False


================================================
FILE: examples/transformer/models/GPT/pretrain_moe/impls.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import numpy as np import paddle import paddle.distributed as dist from paddle.optimizer.lr import LRScheduler from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def _get_model_size(l, h, v, s, ne, ei): assert len(ne) == 1 or len(ne) == l // ei, \ 'num_experts must be either a single value or a list of the same length as the number of MoE layers' P = 0 # embedding P += (v + s) * h logger.info(f'vs: {v} {s}') moe_mode = True if len(ne) == 1: if ne[0] == 1: moe_mode = False ne = ne * (l // ei) for i in range(l): # attention P += 4 * h * h + 4 * h # layer_norm of decoder P += 2 * (2 * h) # MoE Layer if ((i + 1) % ei == 0) and moe_mode: nei = ne[i // ei] # gate P += (h * nei + nei) # experts P += nei * (8 * h * h + 5 * h) # FFN Layer else: P += 8 * h * h + 5 * h # layer_norm of transformer P += 2 * h logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def build_model(config): nranks = dist.get_world_size() model_setting = copy.deepcopy(config.Model) if 'Compress' in config and 'Quantization' in config.Compress: quant_setting = copy.deepcopy(config.Compress.Quantization) model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map', {}) model_setting['freeze_embedding'] = quant_setting.get( 'freeze_embedding', False) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = model_setting['max_position_embeddings'] ne = model_setting['num_experts'] ei = model_setting['expert_interval'] _get_model_size(l, h, v, s, ne, ei) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting.pop("balance_loss_weight") if nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting['num_partitions'] = config.Distributed.mp_degree if config.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) if config.Model.sequence_parallel: register_sequence_parallel_allreduce_hooks( model, config.Global.accumulate_steps, config.Distributed.fuse_sequence_parallel_allreduce) if nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird( sequence_parallel=config.Model.sequence_parallel) return model, tokenizer, loss_fn def model_forward_backward(config, batch, forward_func, **kwargs): acc_steps = 
config.Global.accumulate_steps use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list # HACK: add 'expand' to black_list (put_along_axis_) black_list.append('expand_v2') # train with pipeline strategy if config.Distributed.pp_degree > 1: tokens, position_ids, labels, loss_mask = batch batch = [(tokens, position_ids), (labels, loss_mask)] batches = [batch] with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): batch = kwargs['model']._prepare_training( batch, kwargs['optimizer'], None) loss = kwargs['model'].forward_backward_pipeline(batch, kwargs['scaler']) return loss # train with non-pipeline strategy if acc_steps == 1: batches = [batch] else: split_batches = [paddle.split(b, acc_steps) for b in batch] batches = [] for i in range(len(split_batches[0])): micro_batch = [split_batch[i] for split_batch in split_batches] batches.append(micro_batch) # gradient merge strategy final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): # forward in training step loss = forward_func(micro_batch, kwargs['model'], kwargs['loss_fn']) # calculate auxiliary loss to balance experts' load if max(config.Model. num_experts) > 1 and config.Model.balance_loss_weight: aux_loss_list = [ l.moe_mlp.fleetx_moe.get_loss() for l in kwargs['model'].gpt.decoder.layers if l.moe_mlp is not None ] bal_loss = paddle.concat(aux_loss_list) if bal_loss.dtype == paddle.float16: bal_loss = paddle.cast(bal_loss, dtype=paddle.float32) bal_loss = bal_loss.mean() loss += bal_loss * config.Model.balance_loss_weight loss_bw = kwargs['scaler'].scale(loss) if use_fp16 else loss loss_bw = loss_bw / acc_steps if acc_steps > 1 else loss_bw loss_bw.backward() detach_loss = loss.detach() if final_loss is None: final_loss = detach_loss else: final_loss = paddle.add(final_loss, detach_loss) final_loss = final_loss / acc_steps if acc_steps > 1 else final_loss return final_loss def optim_update_params(config, **kwargs): hcg = env.get_hcg() use_fp16 = config.Global.mix_precision.enable dp_degree = config.Distributed.dp_degree sharding_stage = config.Distributed.sharding.sharding_stage if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): if not hasattr(kwargs['optimizer'], "all_fused_tensors") or kwargs[ 'optimizer'].all_fused_tensors is None: fused_allreduce_gradients(list(kwargs['model'].parameters()), None) else: dp_group = hcg.get_data_parallel_group() all_reduce_parameters(kwargs['optimizer'].all_fused_tensors, dp_group) if sharding_stage == 3 and dp_degree > 1: dp_group = hcg.get_data_parallel_group() fused_allreduce_gradients(kwargs['model'].parameters(), hcg) for p in kwargs['model'].parameters(): if hasattr(p, "bw_storage"): assert p.grad is None, "This case shouldn't happen." 
p.bw_storage.scale_(1.0 / dp_group.nranks) dist.all_reduce(p.bw_storage, group=dp_group) if use_fp16: kwargs['scaler'].step(kwargs['optimizer']) kwargs['scaler'].update() else: kwargs['optimizer'].step() def fit_impl(config, batch, forward_func, **kwargs): kwargs['model'].train() if config.Distributed.pp_degree == 1: if config.Model.use_recompute and isinstance(kwargs['model'], paddle.DataParallel): with kwargs['model'].no_sync(): loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) else: loss = model_forward_backward(config, batch, forward_func, **kwargs) optim_update_params(config, **kwargs) return loss @paddle.no_grad() def eval_impl(config, batch, model, loss_fn): model.eval() use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list with paddle.amp.auto_cast( use_fp16, custom_black_list=black_list, custom_white_list=white_list, level='O2'): tokens, position_ids, labels, loss_mask = batch if config.Distributed.pp_degree == 1: tokens, position_ids, labels, loss_mask = batch preds = model(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = loss_fn(preds, labels, loss_mask) else: batch = [(tokens, position_ids), (labels, loss_mask)] loss = model.eval_batch(batch, compute_loss=True) return loss ================================================ FILE: examples/transformer/models/GPT/pretrain_moe/run.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
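As a cross-check on the MoE configuration above, the parameter-count estimate that _get_model_size() in impls.py logs can be reproduced offline. The following standalone sketch is not part of the repository (the helper name estimate_moe_gpt_params is ours); it re-implements the same counting formula with the hyperparameters from pretrain_moe_345M_single_card.yaml.

# Standalone sketch (not repository code): mirrors _get_model_size() in impls.py above.
def estimate_moe_gpt_params(l, h, v, s, ne, ei):
    """Rough parameter count for an MoE GPT: l layers, hidden h, vocab v, seq len s,
    experts ne (one entry per MoE interval, or a single value), expert interval ei."""
    ne = list(ne)
    if len(ne) == 1:
        moe_mode = ne[0] > 1
        ne = ne * (l // ei)
    else:
        moe_mode = True
    p = (v + s) * h                          # token + position embeddings
    for i in range(l):
        p += 4 * h * h + 4 * h               # attention projections (QKV + output)
        p += 2 * (2 * h)                     # two LayerNorms per decoder layer
        if moe_mode and (i + 1) % ei == 0:
            nei = ne[i // ei]
            p += h * nei + nei               # gate
            p += nei * (8 * h * h + 5 * h)   # one FFN per expert
        else:
            p += 8 * h * h + 5 * h           # dense FFN
    p += 2 * h                               # final LayerNorm
    return p

# Values from pretrain_moe_345M_single_card.yaml: 12 layers, hidden 768, vocab 50304,
# max_position_embeddings 1024, num_experts (2,), expert_interval 2 -> roughly 0.15 B parameters.
print("{:.2f} B".format(estimate_moe_gpt_params(12, 768, 50304, 1024, (2,), 2) / 1e9))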
import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist from paddle.static import InputSpec __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(1, os.path.abspath(os.path.join(__dir__, '../../../../../'))) from ppfleetx.distributed.apis import env, strategy, io from ppfleetx.utils.log import logger from ppfleetx.utils import device, log from examples.transformer.utils import config as cfg from examples.transformer.utils import components as cpn import impls if __name__ == "__main__": # parse config from yaml args = cfg.parse_args() config = cfg.get_config(args.config, overrides=args.override, show=True) # HACK: use certain device paddle.set_device(config.Global.device + ':3') # init distributed env nranks = dist.get_world_size() if nranks > 1: env.init_dist_env(config) env.set_seed(config.Global.seed) cfg.process_configs(config) cfg.print_config(config) # Note: Only for GPTDataset dataset_kwargs = { "seed": config.Global.seed, "model_type": config.Model.name, } sampler_kwargs = {"batch_size": config.Global.local_batch_size, } # build dataloader for training/eval dataset_kwargs.update({"mode": "Train"}) dataset = cpn.build_dataset(config.Data.Train.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Train.sampler, dataset, **sampler_kwargs) train_data_loader = cpn.build_dataloader(config.Data.Train.loader, dataset, sampler) dataset_kwargs.update({"mode": "Eval"}) dataset = cpn.build_dataset(config.Data.Eval.dataset, **dataset_kwargs) sampler = cpn.build_batch_sampler(config.Data.Eval.sampler, dataset, **sampler_kwargs) valid_data_loader = cpn.build_dataloader(config.Data.Eval.loader, dataset, sampler) # build GPT model model, tokenizer, loss_fn = impls.build_model(config) if 'Compress' in config: from examples.transformer.utils import qat input_spec = [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] model, quanter = qat.compress_model(config, model, input_spec) if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
model = paddle.amp.decorate(models=model, level='O2') else: scaler = None config.Optimizer.lr.update({ 'epochs': config.Global.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': config.Global.max_steps, }) # build lr and optim lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: model, optimizer, scaler = strategy.wrap_with_fleet( config.Distributed, model, optimizer, scaler) # load pretrained checkpoints load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if config.Global.save_load.ckpt_dir is not None: io.load(config.Global.save_load.ckpt_dir, model, optimizer, 'train', load_recovery) # build profiler if config.get('Profiler', {}).get('enable', False): profiler = cpn.build_profiler(config.Profiler) else: profiler = None # start training train_start = log.get_timestamp() if load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(load_recovery['rng_state']) for epoch_index in range(load_recovery['epoch'], config.Global.num_train_epochs): train_epoch_start = log.get_timestamp() # time count train_losses = [] train_step_start = log.get_timestamp() # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. total_train_batch = len(train_data_loader) total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 for step, batch in enumerate(train_data_loader): if epoch_index == load_recovery['epoch']: if step <= load_recovery['step']: continue model.train() fit_kwargs = { "model": model, "loss_fn": loss_fn, "scaler": scaler, "optimizer": optimizer, } def forward_func(batch, model, loss_fn): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = model(tokens, position_ids) loss = loss_fn(preds, labels, loss_mask) return loss loss = impls.fit_impl(config, batch, forward_func, **fit_kwargs) train_losses.append(loss) # training step log if (step + 1) % config.Global.logging_freq == 0: train_step_cost = log.get_timestamp() - train_step_start numpy_losses = [loss.numpy()[0] for loss in train_losses] train_cost = train_step_cost \ if step == 0 else train_step_cost / config.Global.logging_freq speed = 1. 
/ train_cost default_global_tokens_num = config.Global.global_batch_size * \ config.Data.Train.dataset.max_seq_len ips_total = speed * default_global_tokens_num ips = ips_total / env.get_data_world_size() logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" % (epoch_index, step, sum(numpy_losses) / len(numpy_losses), train_cost, speed, ips_total, ips, optimizer.get_lr())) train_step_start = log.get_timestamp() train_losses = [] if lr_scheduler is not None: lr_scheduler.step() optimizer.clear_grad() # start eval if step > 0 and config.Global.eval_freq > 0 and step % config.Global.eval_freq == 0: eval_losses = [] eval_step_start = log.get_timestamp() for eval_step, batch in enumerate(valid_data_loader): loss = impls.eval_impl(config, batch, model, loss_fn) eval_losses.append(loss) if eval_step >= config.Global.eval_iters - 1: break eval_step_cost = log.get_timestamp() - eval_step_start eval_loss = sum(eval_losses) / len(eval_losses) eval_cost = eval_step_cost / config.Global.logging_freq logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (epoch_index, eval_step, eval_loss.numpy()[0], eval_cost, 1. / eval_cost)) if step > 0 and config.Global.save_load.save_steps > 0 and \ step % config.Global.save_load.save_steps == 0: device.synchronize() io.save( config.Global.save_load.output_dir, model, optimizer, step=step, epoch=epoch_index, sharding_stage=config.Distributed.sharding.sharding_stage) if step >= config.Global.max_steps: break if profiler: profiler.step() # training epoch log train_epoch_cost = log.get_timestamp() - train_epoch_start logger.info("[Training] epoch: %d, total time: %.5f sec" % (epoch_index, train_epoch_cost)) # training end log logger.info( "The training process is complete and total cost of time for training is : {}". format( log.convert_timestamp_to_data(log.get_timestamp() - train_start))) if profiler: cpn.profiler_done(profiler, config.Profiler) ================================================ FILE: examples/transformer/utils/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: examples/transformer/utils/components.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import random import numpy as np import paddle import paddle.distributed as dist from paddle.optimizer.lr import LRScheduler from paddle.profiler import SummaryView from ppfleetx.data import dataset, sampler, utils from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from ppfleetx.optims import optimizer, grad_clip, lr_scheduler def build_dataset(config_dataset, **config_kwargs): # build dataset if config_dataset is not None: config_dataset = copy.deepcopy(config_dataset) dataset_name = config_dataset.pop('name') config_dataset.update(config_kwargs) dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) else: dataset = None return dataset def build_batch_sampler(config_sampler, dataset, **config_kwargs): # build sampler if config_sampler is not None: config_sampler = copy.deepcopy(config_sampler) sampler_name = config_sampler.pop("name") config_sampler.update(config_kwargs) batch_sampler = eval("sampler.{}".format(sampler_name))( dataset, **config_sampler) logger.debug("build batch_sampler({}) success...".format( batch_sampler)) else: batch_sampler = None return batch_sampler def build_dataloader(config_loader, dataset, batch_sampler=None, **config_kwargs): collate_fn = None if config_loader is not None: config_loader = copy.deepcopy(config_loader) config_loader.update(config_kwargs) collate_fn_cfg = config_loader.pop('collate_fn', None) if isinstance(collate_fn_cfg, str): collate_fn = getattr( utils, collate_fn_cfg) if collate_fn_cfg is not None else None elif isinstance(collate_fn_cfg, dict): collate_fn_class_name = collate_fn_cfg.pop("name") collate_fn = eval("utils.{}".format(collate_fn_class_name))( **collate_fn_cfg) logger.debug("build collate_fn({}) success...".format(collate_fn)) def worker_init_fn(worker_id): """ set seed in subproces for dataloader when num_workers > 0""" np.random.seed(env.get_dp_seed() + worker_id) random.seed(env.get_dp_seed() + worker_id) data_loader = paddle.io.DataLoader( dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, worker_init_fn=worker_init_fn, **config_loader) logger.debug("build data_loader({}) success...".format(data_loader)) return data_loader def build_lr_scheduler(lr_config): if 'name' in lr_config: lr_name = lr_config.pop('name') lr = eval("lr_scheduler.{}".format(lr_name))(**lr_config) if isinstance(lr, LRScheduler): return lr else: return lr() else: lr = lr_config.learning_rate logger.debug("build lr ({}) success..".format(lr)) return lr def build_grad_clip(grad_clip_config): if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval("grad_clip.{}".format(grad_clip_name))( **grad_clip_config) return grad_clip else: return None def build_optimizer(config, model, lr_scheduler=None, multi_precision=False): config = copy.deepcopy(config) if lr_scheduler is not None: config.pop('lr') grad_clip_config = config.pop('grad_clip', None) grad_clip = build_grad_clip(grad_clip_config) optim_name = config.pop('name') optim = eval("optimizer.{}".format(optim_name))( learning_rate=lr_scheduler, parameters=model.parameters(), grad_clip=grad_clip, multi_precision=multi_precision, **config) logger.debug("build optimizer ({}) success..".format(optim)) return optim def build_profiler(profiler_config): profiler = None if profiler_config.get('enable', 
False): scheduler = profiler_config.get('scheduler', None) profiler_log = profiler_config.get('profiler_log', './profiler_log') record_shapes = profiler_config.get('record_shapes', True) profile_memory = profiler_config.get('profile_memory', True) profiler = paddle.profiler.Profiler( targets=[ paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU ], scheduler=scheduler, on_trace_ready=paddle.profiler.export_chrome_tracing(profiler_log), record_shapes=record_shapes, profile_memory=profile_memory) profiler.start() logger.warning("Profiler is enabled, do not enable it in production.") return profiler def profiler_done(profiler, profiler_config): if not profiler: return logger.info("Profiler finished, prepare to print summary...") profiler.stop() _print_summary(profiler, profiler_config) profiler_log = profiler_config.get('profiler_log', './profiler_log') logger.info( "For more information please install visualdl and run it with following command:" ) logger.info( "-------------------------------------------------------------------------------" ) logger.info(f"visualdl --host 0.0.0.0 --logdir {profiler_log}") logger.info( "-------------------------------------------------------------------------------" ) def _print_summary(profiler, profiler_config): views_dict = { SummaryView.DeviceView: 'device', SummaryView.OverView: 'overview', SummaryView.ModelView: 'model', SummaryView.DistributedView: 'dist', SummaryView.KernelView: 'kernel', SummaryView.OperatorView: 'op', SummaryView.MemoryView: 'mem', SummaryView.MemoryManipulationView: 'memcpy', SummaryView.UDFView: 'udf', } default_views = [ SummaryView.OverView, SummaryView.ModelView, SummaryView.KernelView, SummaryView.OperatorView, ] def gen_views(cfg): # print all summary view if detailed=True if profiler_config.get('detailed', False): return None views = [] # override default view with user defined value if detailed=False for view in SummaryView: v = profiler_config.get('summary', {}).get(views_dict[view], None) if v is True or (v is None and view in default_views): views.append(view) return views or None profiler.summary( sorted_by=paddle.profiler.SortedKeys.GPUTotal, views=gen_views(profiler_config)) ================================================ FILE: examples/transformer/utils/config.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import logging import os import sys import copy import argparse import codecs import yaml import numpy as np import paddle import paddle.distributed as dist from paddle.fluid import core from paddle.fluid.reader import use_pinned_memory from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger, advertise from ppfleetx.utils import check __all__ = ['get_config', 'print_config'] class AttrDict(dict): def __getattr__(self, key): return self[key] def __setattr__(self, key, value): if key in self.__dict__: self.__dict__[key] = value else: self[key] = value def __copy__(self): cls = self.__class__ result = cls.__new__(cls) result.__dict__.update(self.__dict__) return result def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): setattr(result, k, copy.deepcopy(v, memo)) for k, v in self.items(): setattr(result, k, copy.deepcopy(v, memo)) return result def setdefault(self, k, default=None): if k not in self or self[k] is None: self[k] = default return default else: return self[k] def create_attr_dict(yaml_config): from ast import literal_eval for key, value in yaml_config.items(): if type(value) is dict: yaml_config[key] = value = AttrDict(value) if isinstance(value, str): try: value = literal_eval(value) except BaseException: pass if isinstance(value, AttrDict): create_attr_dict(yaml_config[key]) else: yaml_config[key] = value def parse_config(cfg_file): """Load a config file into AttrDict""" def _update_dic(dic, base_dic): '''Update config from dic based base_dic ''' base_dic = base_dic.copy() dic = dic.copy() if dic.get('_inherited_', True) == False: dic.pop('_inherited_') return dic for key, val in dic.items(): if isinstance(val, dict) and key in base_dic: base_dic[key] = _update_dic(val, base_dic[key]) else: base_dic[key] = val dic = base_dic return dic def _parse_from_yaml(path): '''Parse a yaml file and build config''' with codecs.open(path, 'r', 'utf-8') as file: dic = yaml.load(file, Loader=yaml.FullLoader) if '_base_' in dic: cfg_dir = os.path.dirname(path) base_path = dic.pop('_base_') base_path = os.path.join(cfg_dir, base_path) base_dic = _parse_from_yaml(base_path) dic = _update_dic(dic, base_dic) return dic yaml_dict = _parse_from_yaml(cfg_file) yaml_config = AttrDict(yaml_dict) create_attr_dict(yaml_config) return yaml_config def print_dict(d, delimiter=0): """ Recursively visualize a dict and indenting acrrording by the relationship of keys. 
""" placeholder = "-" * 60 for k, v in sorted(d.items()): if isinstance(v, dict): logger.info("{}{} : ".format(delimiter * " ", k)) print_dict(v, delimiter + 4) elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): logger.info("{}{} : ".format(delimiter * " ", k)) for value in v: print_dict(value, delimiter + 4) else: logger.info("{}{} : {}".format(delimiter * " ", k, v)) if k.isupper(): logger.info(placeholder) def print_config(config): """ visualize configs Arguments: config: configs """ advertise() print_dict(config) def check_config(config): """ Check config """ # global_batch_size = config.get("") global_config = config.get('Global') check.check_version() device = global_config.get('device', 'gpu') device = device.lower() if device in ['gpu', 'xpu', 'rocm', 'npu', "cpu"]: check.check_device(device) else: raise ValueError( f"device({device}) is not in ['gpu', 'xpu', 'rocm', 'npu', 'cpu'],\n" "Please ensure the config option Global.device is one of these devices" ) def override(dl, ks, v): """ Recursively replace dict of list Args: dl(dict or list): dict or list to be replaced ks(list): list of keys v(str): value to be replaced """ def str2num(v): try: return eval(v) except Exception: return v assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") assert len(ks) > 0, ('lenght of keys should larger than 0') if isinstance(dl, list): k = str2num(ks[0]) if len(ks) == 1: assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) dl[k] = str2num(v) else: override(dl[k], ks[1:], v) else: if len(ks) == 1: # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) if not ks[0] in dl: print('A new field ({}) detected!'.format(ks[0], dl)) dl[ks[0]] = str2num(v) else: if ks[0] not in dl.keys(): dl[ks[0]] = {} print("A new Series field ({}) detected!".format(ks[0], dl)) override(dl[ks[0]], ks[1:], v) def override_config(config, options=None): """ Recursively override the config Args: config(dict): dict to be replaced options(list): list of pairs(key0.key1.idx.key2=value) such as: [ 'topk=2', 'VALID.transforms.1.ResizeImage.resize_short=300' ] Returns: config(dict): replaced config """ if options is not None: for opt in options: assert isinstance(opt, str), ( "option({}) should be a str".format(opt)) assert "=" in opt, ( "option({}) should contain a =" "to distinguish between key and value".format(opt)) pair = opt.split('=') assert len(pair) == 2, ("there can be only a = in the option") key, value = pair keys = key.split('.') override(config, keys, value) return config def get_config(fname, overrides=None, show=False): """ Read config from file """ assert os.path.exists(fname), ( 'config file({}) is not exist'.format(fname)) config = parse_config(fname) override_config(config, overrides) process_dist_config(config) process_global_configs(config) create_attr_dict(AttrDict(config)) if show: print_config(config) check_config(config) return config def parse_args(): parser = argparse.ArgumentParser("train script") parser.add_argument( '-c', '--config', type=str, default='configs/config.yaml', help='config file path') parser.add_argument( '-o', '--override', action='append', default=[], help='config options to be overridden') args = parser.parse_args() return args def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False def process_dist_config(configs): """ process distributed strategy for hybrid parallel """ nranks = 
dist.get_world_size() config = configs['Distributed'] config.setdefault("hcg", "HybridCommunicateGroup") mp_degree = config.setdefault("mp_degree", 1) pp_degree = config.setdefault("pp_degree", 1) pp_recompute_interval = config.setdefault("pp_recompute_interval", 1) # sharding default sharding_config = config['sharding'] sharding_degree = sharding_config.setdefault("sharding_degree", 1) sharding_stage = sharding_config.setdefault('sharding_stage', 2) sharding_offload = sharding_config.setdefault('sharding_offload', False) reduce_overlap = sharding_config.setdefault('reduce_overlap', False) broadcast_overlap = sharding_config.setdefault('broadcast_overlap', False) other_degree = mp_degree * pp_degree * sharding_degree assert nranks % other_degree == 0, "unreasonable config of dist_strategy." dp_degree = config.setdefault("dp_degree", nranks // other_degree) assert nranks % dp_degree == 0, "unreasonable config of dist_strategy." assert nranks == dp_degree * other_degree, \ "Mismatched config using {} cards with dp_degree[{}]," \ "mp_degree[{}], pp_degree[{}] and sharding_degree[{}]".format(nranks, \ dp_degree, mp_degree, pp_degree, sharding_degree) if sharding_config['sharding_degree'] > 1 and reduce_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['reduce_overlap'] = False logger.warning( "reduce overlap only valid for sharding stage 2 without offload" ) if sharding_config['sharding_degree'] > 1 and broadcast_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['broadcast_overlap'] = False logger.warning( "broadcast overlap only valid for sharding stage 2 without offload" ) if broadcast_overlap and configs['Global']['logging_freq'] == 1: logger.warning( "Set logging_freq to 1 will disable broadcast_overlap. " "If you want to overlap the broadcast, please increase the logging_freq." ) sharding_config['broadcast_overlap'] = False if sharding_config['sharding_degree'] > 1: if getattr(sharding_config, 'broadcast_overlap', False): logger.warning( "Enable broadcast overlap for sharding will not use pin memory for dataloader" ) use_pinned_memory(False) if 'fuse_sequence_parallel_allreduce' not in config: config['fuse_sequence_parallel_allreduce'] = False def process_global_configs(config): """ process global configs for hybrid parallel """ dp_degree = config['Distributed']['dp_degree'] pp_degree = config['Distributed']['pp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] config['Global']['enable_partial_send_recv'] = True if 'sequence_parallel' in config['Model'] and pp_degree > 1: if config['Model']['sequence_parallel']: config['Global']['enable_partial_send_recv'] = False logger.warning( "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv will be set False." 
) global_cfg = config['Global'] # Set environment variable flags = global_cfg.get("flags", {}) paddle.set_flags(flags) for k, v in flags.items(): logger.info("Environment variable {} is set {}.".format(k, v)) if global_cfg['global_batch_size'] is None and global_cfg[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is not None: assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == (dp_degree * sharding_degree), "global_batch_size[{}] should be divided by local_batch_size[{}] "\ "when dp_degree is [{}] and sharding_degree is [{}]".format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree, sharding_degree) elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is None: assert global_cfg['global_batch_size'] % (dp_degree * sharding_degree) == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]"\ .format(global_cfg['global_batch_size'], dp_degree, sharding_degree) global_cfg['local_batch_size'] = global_cfg['global_batch_size'] // ( dp_degree * sharding_degree) else: global_cfg['global_batch_size'] = global_cfg[ 'local_batch_size'] * dp_degree * sharding_degree assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0 # save_load global_cfg['save_load'] = global_cfg.get('save_load', {}) save_load_cfg = global_cfg.save_load save_steps = save_load_cfg.get('save_steps', None) save_epoch = save_load_cfg.get('save_epoch', None) if save_steps is None or save_steps == -1: save_load_cfg[ 'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint if save_epoch is None or save_epoch == -1: save_load_cfg['save_epoch'] = 1 save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output') save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None) # mix_precision global_cfg['mix_precision'] = global_cfg.get('mix_precision', {}) amp_cfg = global_cfg.mix_precision amp_cfg['enable'] = amp_cfg.get('enable', False) amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768) amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None) amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None) global_cfg['max_steps'] = global_cfg.get('max_steps', 500000) global_cfg['eval_freq'] = global_cfg.get('eval_freq', -1) global_cfg['eval_iters'] = global_cfg.get('eval_iters', 0) global_cfg['logging_freq'] = global_cfg.get('logging_freq', 1) global_cfg['num_train_epochs'] = global_cfg.get('num_train_epochs', 1) global_cfg['test_iters'] = global_cfg['eval_iters'] * 10 \ if global_cfg.get('test_iters', None) is None else global_cfg['test_iters'] global_cfg[ 'accumulate_steps'] = global_cfg.local_batch_size // global_cfg.micro_batch_size def process_model_configs(config): """ process model configs for hybrid parallel """ configs = config['Model'] if configs['ffn_hidden_size'] is None: configs['ffn_hidden_size'] = 4 * configs['hidden_size'] if configs['use_recompute']: if not configs['recompute_granularity']: configs['recompute_granularity'] = 'full' if not configs['no_recompute_layers']: configs['no_recompute_layers'] = [] else: assert isinstance(configs['no_recompute_layers'], list), "no_recompute_layers should be a list" for i in configs['no_recompute_layers']: assert isinstance( i, int ), "all values in no_recompute_layers should be an integer" assert min(configs['no_recompute_layers']) >= 0, \ "the min value in 
no_recompute_layers should >= 0" assert max(configs['no_recompute_layers']) < configs['num_layers'], \ "the max value in no_recompute_layers should < num_layers" configs['no_recompute_layers'] = sorted( list(set(configs['no_recompute_layers']))) if configs['fused_linear'] and not is_fused_matmul_bias_supported(): configs['fused_linear'] = False logging.warning( "The flag fused_linear only valid for cuda version higher than 11.6, " "but the paddle is compiled with cuda " + paddle.version.cuda()) pp_degree = config.Distributed.pp_degree if pp_degree > 1: configs['virtual_pp_degree'] = 1 \ if configs.get('virtual_pp_degree', None) is None \ else configs['virtual_pp_degree'] virtual_pp_degree = configs['virtual_pp_degree'] num_layers = configs.num_layers if not (num_layers % (virtual_pp_degree * pp_degree)) == 0: assert virtual_pp_degree == 1, "virtual pp doesn't support uneven layer split." logger.warning( "The num_layers of the model is not divisible by pp_degree." \ "Receive num_layers: {}, pp_degree: {}.".format(num_layers, pp_degree)) else: assert (num_layers % (virtual_pp_degree * pp_degree)) == 0, \ "The num_layers of the model should be divisible of pp_degree * virtual_pp_degree." \ "Receive num_layers: {}, pp_degree: {}, virtual_pp_degree: {}.".format( num_layers, pp_degree, virtual_pp_degree) if virtual_pp_degree > 1: local_batch_size = config.Global.local_batch_size micro_batch_size = config.Global.micro_batch_size acc_steps = local_batch_size // micro_batch_size assert acc_steps % pp_degree == 0, "num of microbatches {} should be divisible of pp_degree {} when " \ "using interleave pipeline".format(acc_steps, pp_degree) if virtual_pp_degree > 2: logger.warning( "Setting virtual_pp_degree > 2 may harm the throughput of the pipeline parallel." 
) else: if configs.get('virtual_pp_degree', None): logger.warning("virtual_pp_degree is unuseful.") def process_optim_configs(config): """ process optim configs for hybrid parallel """ if 'Optimizer' not in config.keys(): return nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] if config['Optimizer']['tensor_fusion']: assert nranks == dp_degree * sharding_degree, \ "tensor_fusion only support single card train or data/sharding parallel train" if config['Optimizer']['lr']['decay_steps'] is None: config['Optimizer']['lr']['decay_steps'] = config['Engine'][ 'max_steps'] config['Optimizer']['lr']['decay_steps'] *= config['Global'][ 'global_batch_size'] def process_data_configs(config): """ process data configs for hybrid parallel """ if 'Data' not in config.keys(): return cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Global']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Global']['max_steps'] // config['Global']['eval_freq'] + 1) * config['Global']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Global']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] def process_inference_configs(config): """ process inference configs for hybrid parallel """ if 'Inference' not in config.keys(): return configs = config['Inference'] if configs['model_dir'] is None: configs['model_dir'] = config['Global']['save_load']['output_dir'] if configs['mp_degree'] is None: configs['mp_degree'] = config['Distributed']['mp_degree'] def process_configs(config): process_data_configs(config) process_model_configs(config) process_optim_configs(config) process_inference_configs(config) return config ================================================ FILE: examples/transformer/utils/qat.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
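process_global_configs() above pins down how the batch-size fields relate: global_batch_size = local_batch_size * dp_degree * sharding_degree, accumulate_steps = local_batch_size // micro_batch_size, and whichever of global/local is missing is derived from the other. A standalone sketch of that arithmetic (not repository code; the name derive_batch_sizes is ours):

# Standalone sketch of the batch-size arithmetic in process_global_configs().
def derive_batch_sizes(global_bs, local_bs, micro_bs, dp_degree, sharding_degree):
    replicas = dp_degree * sharding_degree
    if global_bs is None and local_bs is None:
        raise ValueError("global_batch_size or local_batch_size should be set.")
    elif global_bs is not None and local_bs is not None:
        assert global_bs // local_bs == replicas
    elif global_bs is not None:
        assert global_bs % replicas == 0
        local_bs = global_bs // replicas
    else:
        global_bs = local_bs * replicas
    assert local_bs % micro_bs == 0
    return global_bs, local_bs, local_bs // micro_bs  # last value is accumulate_steps

# e.g. the MoE single-card config (local=8, micro=2, dp=1, sharding=1)
# yields global_batch_size=8 and accumulate_steps=4.
print(derive_batch_sizes(None, 8, 2, 1, 1))  # (8, 8, 4)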
import paddle

from ppfleetx.distributed.apis import io
from ppfleetx.utils.compression_helper import prune_model, quant_model


def compress_model(config, model, input_spec):
    quanter, quant_configs = None, None
    prune_configs, compress_configs = None, None

    if 'Compress' in config:
        compress_configs = config['Compress']
        if "Prune" in compress_configs:
            prune_configs = compress_configs["Prune"]
        if "Quantization" in compress_configs:
            quant_configs = compress_configs["Quantization"]

        # Load pretrained model before compression
        if 'pretrained' in compress_configs and compress_configs[
                'pretrained'] is not None:
            ckpt_dir = compress_configs['pretrained']
            io.load(
                ckpt_dir,
                model,
                optimizer=None,
                mode='quant',
                load_recovery=None)
            # Avoid loading again
            config.Global.save_load.ckpt_dir = None

    if prune_configs is not None and prune_configs.enable:
        prune_model(model, prune_configs, input_spec)
    # NOTE(minghaoBD): We haven't fully tested Prune+Quantization, so an "else if" is put here for separation.
    elif quant_configs is not None and quant_configs.enable:
        model, quanter = quant_model(model, quant_configs)

    return model, quanter
================================================ FILE: ppfleetx/__init__.py ================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
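All of the configs that follow (imagen, ernie, gpt) start from a _base_: reference that parse_config() in examples/transformer/utils/config.py resolves by recursively merging the child YAML over its base. A standalone sketch of that merge rule (not repository code; merge_over_base and the two dicts are made up for illustration):

# Standalone sketch of the _base_ merge rule implemented by parse_config()/_update_dic():
# child values override base values key by key, recursing into nested dicts, unless a
# sub-dict sets _inherited_: False, in which case it replaces the base sub-dict wholesale.
def merge_over_base(child, base):
    if child.get('_inherited_', True) is False:
        child = dict(child)
        child.pop('_inherited_')
        return child
    merged = dict(base)
    for key, val in child.items():
        if isinstance(val, dict) and key in merged and isinstance(merged[key], dict):
            merged[key] = merge_over_base(val, merged[key])
        else:
            merged[key] = val
    return merged

# Made-up example mirroring how imagen_397M_text2im_64x64.yaml overrides imagen_base.yaml:
base = {"Global": {"device": "gpu", "local_batch_size": 1}, "Model": {"name": "Imagen"}}
child = {"Global": {"local_batch_size": 4}, "Model": {"name": "imagen_397M_text2im_64"}}
print(merge_over_base(child, base))
# {'Global': {'device': 'gpu', 'local_batch_size': 4}, 'Model': {'name': 'imagen_397M_text2im_64'}}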
================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_397M_text2im_64 text_encoder_name: projects/imagen/t5/t5-11b text_embed_dim: 1024 timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: True auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 use_recompute: False recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 16 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 2500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 10 eval_freq: 10000000 eval_iters: 10000000 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 10000 output_dir: ./output ckpt_dir: Model: module: "ImagenModule" name: "Imagen" fused_linear: False # data loader for train Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 16 drop_last: True collate_fn: imagen_collate_fn Fused: tensor_fusion: False Optimizer: name: Adam weight_decay: 0. 
beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 2500000 warmup_rate: 0.025 max_lr: 1.0e-4 min_lr: 0.0 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Inference: model_dir: ./output mp_degree: 1 ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_SR1024 text_encoder_name: None text_embed_dim: timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_cond: True lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: False auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 is_sr: True use_recompute: True recompute_granularity: Engine: max_steps: 2500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 10 eval_freq: 10000000 eval_iters: 10000000 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] fp16_dtype: "bfloat16" save_load: save_steps: 10000 output_dir: ./output ckpt_dir: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 1024 text_max_len: 128 filter_image_resolution: 1024 sr: True loader: num_workers: 8 shuffle: True batch_size: 1 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_SR256 text_encoder_name: None # We do not use text conditoin during training. 
text_embed_dim: timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_cond: True lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: False auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 is_sr: True use_recompute: True recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 256 text_max_len: 128 filter_image_resolution: 256 sr: True loader: num_workers: 8 shuffle: True batch_size: 6 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_text2im_64_debertav2 text_encoder_name: projects/imagen/cache/deberta-v-xxlarge text_embed_dim: 1536 timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: True auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 use_recompute: False recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 8 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml ================================================ _base_: ./imagen_base.yaml Global: global_batch_size: local_batch_size: 1 micro_batch_size: 1 Model: name: imagen_text2im_64 text_encoder_name: projects/imagen/t5/t5-11b text_embed_dim: 1024 timesteps: 1000 channels: 3 cond_drop_prob: 0.1 noise_schedules: cosine pred_objectives: noise lowres_noise_schedule: linear lowres_sample_noise_level: 0.2 per_sample_random_aug_noise_level: False condition_on_text: True auto_normalize_img: True p2_loss_weight_gamma: 0.5 dynamic_thresholding: True, dynamic_thresholding_percentile: 0.95 only_train_unet_number: 1 use_recompute: True recompute_granularity: Data: Train: dataset: name: ImagenDataset input_path: ./projects/imagen/filelist/laion_400M/train shuffle: True image_format: base64 image_size: 64 text_max_len: 128 filter_image_resolution: 64 loader: num_workers: 8 shuffle: True batch_size: 8 drop_last: True collate_fn: imagen_collate_fn Loss: name: mse_loss p2_loss_weight_k: 1.0 Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml 
================================================ _base_: ./finetune_ernie_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 40000 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 intermediate_size: hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 4 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: True use_recompute: False Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/finetune_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 16 micro_batch_size: 16 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: level: scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieSeqClsModuleAuto" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: collate_fn: name: DataCollatorWithPadding dataset: name: ErnieSeqClsDataset dataset_type: chnsenticorp_v2 tokenizer_type: ernie-1.0-base-zh-cw max_seq_len: 512 Eval: collate_fn: name: DataCollatorWithPadding dataset: name: ErnieSeqClsDataset dataset_type: chnsenticorp_v2 tokenizer_type: ernie-1.0-base-zh-cw max_seq_len: 512 Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 990000 warmup_rate: 0.01 max_lr: 0.0001 min_lr: 5e-05 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieModuleAuto" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: sample_split: 4 collate_fn: name: ErnieCollateData micro_batch_size: dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Train max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False 
favor_longer_ngram: False max_ngrams: 3 Eval: sample_split: 4 collate_fn: name: ErnieCollateData micro_batch_size: 1 dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Eval max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False favor_longer_ngram: False max_ngrams: 3 Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 990000 warmup_rate: 0.01 max_lr: 0.0001 min_lr: 0.00001 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 ================================================ FILE: ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base_345M_single_card.yaml ================================================ _base_: ./pretrain_ernie_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 40000 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 intermediate_size: hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 4 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: True use_recompute: False Data: Train: dataset: tokenizer_type: ernie-1.0-base-zh-cw Eval: dataset: tokenizer_type: ernie-1.0-base-zh-cw Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml ================================================ _base_: ./finetune_ernie_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 40000 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 intermediate_size: hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 4 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: True use_recompute: False Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 16 micro_batch_size: 16 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieSeqClsModule" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: dataset: name: ErnieSeqClsDataset dataset_type: chnsenticorp_v2 tokenizer_type: ernie-1.0-base-zh-cw max_seq_len: 512 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 0 return_list: False collate_fn: name: DataCollatorWithPadding Eval: 
    dataset:
      name: ErnieSeqClsDataset
      dataset_type: chnsenticorp_v2
      tokenizer_type: ernie-1.0-base-zh-cw
      max_seq_len: 512
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 0
      return_list: False
    collate_fn:
      name: DataCollatorWithPadding

Optimizer:
  name: FusedAdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 990000
    warmup_rate: 0.01
    max_lr: 5e-05
    min_lr: 1e-05
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False


================================================
FILE: ppfleetx/configs/nlp/ernie/inference_ernie_345M_single_card.yaml
================================================
_base_: ./finetune_ernie_345M_single_card.yaml

Inference:
  model_dir: ./output
  mp_degree: 1

Distributed:
  dp_degree:
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
    reduce_overlap: False
    broadcast_overlap: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml
================================================
Global:
  device: gpu
  seed: 1024
  binary_head: True
  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

Engine:
  max_steps: 500000
  num_train_epochs: 1
  accumulate_steps: 1
  logging_freq: 1
  eval_freq: 500000
  eval_iters: 10
  test_iters: -1
  mix_precision:
    enable: False
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
  save_load:
    save_steps: 50000
    save_epoch: 1
    output_dir: ./output
    ckpt_dir:

Model:
  module: "ErnieModule"
  name: "Ernie"
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  intermediate_size: 3072
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 2
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: False
  use_recompute: False

Data:
  Train:
    dataset:
      name: ErnieDataset
      input_dir: ./data
      tokenizer_type: ernie-1.0-base-zh-cw
      split: [949, 50, 1]
      mode: Train
      max_seq_length: 512
      masked_lm_prob: 0.15
      short_seq_prob: 0.1
      seed: 1024
      share_folder: False
      favor_longer_ngram: False
      max_ngrams: 3
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 0
      return_list: False
    collate_fn:
      name: ErnieCollateData
      micro_batch_size:

  Eval:
    dataset:
      name: ErnieDataset
      input_dir: ./data
      tokenizer_type: ernie-1.0-base-zh-cw
      split: [949, 50, 1]
      mode: Eval
      max_seq_length: 512
      masked_lm_prob: 0.15
      short_seq_prob: 0.1
      seed: 1024
      share_folder: False
      favor_longer_ngram: False
      max_ngrams: 3
    sampler:
      name: GPTBatchSampler
      shuffle: False
      drop_last: True
    loader:
      num_workers: 1
      return_list: False
    collate_fn:
      name: ErnieCollateData
      micro_batch_size: 1

Optimizer:
  name: FusedAdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 990000
    warmup_rate: 0.01
    max_lr: 0.0001
    min_lr: 0.00001
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0
  tensor_fusion: False

Profiler:
  enable: False
  scheduler: [1, 5]
  profiler_log: profiler_log
  detailed: False

Inference:
  model_dir: ./output
  mp_degree: 1


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 512
  micro_batch_size: 1

Model:
  vocab_size: 40000
  hidden_size: 12288
  num_hidden_layers: 96
  num_attention_heads: 96
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: True

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 16
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 40000
  hidden_size: 1024
  num_hidden_layers: 24
  num_attention_heads: 16
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: False

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 1

Model:
  vocab_size: 40000
  hidden_size: 768
  num_hidden_layers: 8
  num_attention_heads: 16
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: False

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 2
  mp_degree: 2
  pp_degree: 2
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False


================================================
FILE: ppfleetx/configs/nlp/ernie/pretrain_ernie_base_6.7B_sharding16.yaml
================================================
_base_: ./pretrain_ernie_base.yaml

Global:
  global_batch_size:
  local_batch_size: 512
  micro_batch_size: 1

Model:
  vocab_size: 40000
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  intermediate_size:
  hidden_act: "gelu"
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 4
  initializer_range: 0.02
  pad_token_id: 0
  task_type_vocab_size: 3
  task_id: 0
  use_task_id: True
  use_recompute: True

Data:
  Train:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw
  Eval:
    dataset:
      tokenizer_type: ernie-1.0-base-zh-cw

Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 16
  sharding:
    sharding_degree: 1
    sharding_stage: 1
    sharding_offload: False
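Every file in this group layers a small set of overrides on top of pretrain_ernie_base.yaml through the `_base_` key: only the Global batch sizes, the Model dimensions, and the Distributed degrees change between variants. A minimal sketch of how such an include-and-merge chain can be resolved is shown below; it assumes a simple recursive "child keys win" merge, and the helper names (`load_config`, `merge`) are illustrative rather than the actual PaddleFleetX loader API.

import os
import yaml

def merge(base, override):
    # Recursively overlay `override` onto `base`; scalar values in the child win.
    out = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = merge(out[key], value)
        else:
            out[key] = value
    return out

def load_config(path):
    # Load a YAML config, resolving its `_base_` parent first (if any).
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base_rel = cfg.pop("_base_", None)
    if base_rel is None:
        return cfg
    base = load_config(os.path.join(os.path.dirname(path), base_rel))
    return merge(base, cfg)

Resolved this way, a variant such as pretrain_ernie_base_3D.yaml keeps the base Optimizer and Data sections while replacing only the hidden size and the dp/mp/pp degrees it lists.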
hidden_act: "relu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 use_recompute: False Data: Train: dataset: tokenizer_type: ernie-1.0-large-zh-cw Eval: dataset: tokenizer_type: ernie-1.0-large-zh-cw Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False ================================================ FILE: ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml ================================================ Global: device: gpu seed: 1024 binary_head: True global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: 1 logging_freq: 1 eval_freq: 500000 eval_iters: 10 test_iters: -1 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 50000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "ErnieModule" name: "Ernie" hidden_size: 768 num_hidden_layers: 12 num_attention_heads: 12 intermediate_size: 3072 hidden_act: "gelu" hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 512 type_vocab_size: 2 initializer_range: 0.02 pad_token_id: 0 task_type_vocab_size: 3 task_id: 0 use_task_id: False use_recompute: False Data: Train: dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Train max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False favor_longer_ngram: False max_ngrams: 3 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 0 return_list: False collate_fn: name: ErnieCollateData micro_batch_size: Eval: dataset: name: ErnieDataset input_dir: ./data tokenizer_type: ernie-1.0-base-zh-cw split: [949, 50, 1] mode: Eval max_seq_length: 512 masked_lm_prob: 0.15 short_seq_prob: 0.1 seed: 1024 share_folder: False favor_longer_ngram: False max_ngrams: 3 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: name: ErnieCollateData micro_batch_size: 1 Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 990000 warmup_rate: 0.01 max_lr: 0.0001 min_lr: 0.00001 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Inference: model_dir: ./output mp_degree: 1 Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: -1 num_train_epochs: -1 eval_freq: -1 eval_iters: -1 test_iters: -1 mix_precision: enable: True dtype: "float16" level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"] custom_white_list: ["lookup_table", "lookup_table_v2"] 
================================================
FILE: ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml
================================================
Global:
  device: gpu
  seed: 1024
  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

Engine:
  max_steps: -1
  num_train_epochs: -1
  eval_freq: -1
  eval_iters: -1
  test_iters: -1
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False
  save_load:
    output_dir:
    ckpt_dir:

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_175B_mp8.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 1
  top_p: 0.9
  temperature: 1.0
  min_dec_len: 1
  max_dec_len: 8
  use_topp_sampling: True
  num_return_sequences: 1
  decode_strategy: "sampling"
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 51200
  hidden_size: 12288
  num_layers: 96
  num_attention_heads: 96
  ffn_hidden_size: 49152
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 1
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 8
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_mp2.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 50
  top_p: 0.75
  temperature: 1.0
  min_dec_len: 1
  max_dec_len: 200
  num_return_sequences: 1
  decode_strategy: "sampling"
  use_topp_sampling: True
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 2
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_single_card.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 0
  top_p: 0.9
  use_topp_sampling: True
  inference: True
  temperature: 1.0
  min_dec_len: 8
  max_dec_len: 8
  num_return_sequences: 1
  decode_strategy: "sampling"
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 0
  top_p: 0.9
  use_topp_sampling: True
  inference: True
  temperature: 1.0
  min_dec_len: 8
  max_dec_len: 8
  num_return_sequences: 1
  decode_strategy: "sampling"
  early_finish: True

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 51200
  hidden_size: 4096
  num_layers: 32
  num_attention_heads: 32
  ffn_hidden_size: 16384
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 2048
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:

Distributed:
  dp_degree: 8
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8_tuning.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 2048
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity: "full_attn"
  no_recompute_layers:

Distributed:
  dp_degree: 8
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1

Tuning:
  enable: True
  tuning_recompute: True
  profile_start_step: 1
  profile_end_step: 5


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size: 8
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 2048
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1
================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_345M_single_card.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Global:
  global_batch_size:
  local_batch_size: 8
  micro_batch_size: 8

Model:
  vocab_size: 50304
  hidden_size: 4096
  num_layers: 32
  num_attention_heads: 32
  ffn_hidden_size:
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  fuse_attn_qkv: True
  use_recompute: True
  recompute_granularity:
  no_recompute_layers:

Distributed:
  dp_degree:
  mp_degree: 1
  pp_degree: 1
  sharding:
    sharding_degree: 16
    sharding_stage: 2


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_base.yaml
================================================
Global:
  device: gpu
  seed: 1024
  global_batch_size:
  local_batch_size: 1
  micro_batch_size: 1

Engine:
  max_steps: 500000
  num_train_epochs: 1
  eval_freq: 1
  eval_iters: 10
  test_iters:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False
  save_load:
    output_dir: ./output
    ckpt_dir:

Model:
  module: "GPTModuleAuto"
  name: "GPT"
  fuse_attn_qkv: False

Data:
  Train:
    collate_fn: gpt_collate_fn
    sample_split: 2
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024

  Eval:
    collate_fn: gpt_collate_fn
    sample_split: 2
    dataset:
      name: GPTDataset
      input_dir: ./data/
      split: [949, 50, 1]
      max_seq_len: 1024

Optimizer:
  name: AdamW
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  lr:
    name: CosineAnnealingWithWarmupDecay
    decay_steps: 360000
    warmup_rate: 0.01
    max_lr: 5.0e-5
    min_lr: 1.0e-5
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0


================================================
FILE: ppfleetx/configs/nlp/gpt/auto/qat_generation_gpt_345M_mp2.yaml
================================================
_base_: ./pretrain_gpt_base.yaml

Engine:
  mix_precision:
    enable: True
    dtype: "float16"
    level: "o2"
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"]
    custom_white_list: ["lookup_table", "lookup_table_v2"]
    use_fp16_guard: False

Generation:
  top_k: 50
  top_p: 0.75
  temperature: 1.0
  min_dec_len: 1
  max_dec_len: 200
  num_return_sequences: 1
  decode_strategy: "sampling"

Model:
  module: GPTGenerationModuleAuto
  vocab_size: 50304
  hidden_size: 1024
  num_layers: 24
  num_attention_heads: 16
  ffn_hidden_size: 4096
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 1024
  type_vocab_size: 16
  initializer_range: 0.02
  use_recompute: False
  fuse_attn_qkv: True

Distributed:
  dp_degree: 1
  mp_degree: 2
  pp_degree: 1
  sharding:
    sharding_degree: 1
    sharding_stage: 1

Quantization:
  enable: True
  channel_wise_abs_max: False
  weight_bits: 8
  activation_bits: 8
  onnx_format: True


================================================
FILE: ppfleetx/configs/nlp/gpt/eval_gpt_345M_single_card.yaml
================================================
_base_: ./pretrain_gpt_345M_single_card.yaml

Model:
  module: GPTEvalModule

Offline_Eval:
  eval_path: ./wikitext-103/wiki.valid.tokens
  cloze_eval: False
  overlapping_eval: 32
  batch_size: 8
  max_seq_len: 1024
  logging_freq: 10


================================================
FILE: ppfleetx/configs/nlp/gpt/eval_pruned_gpt_345M_single_card.yaml
================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Engine: save_load: ckpt_dir: Model: module: GPTEvalModule hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Offline_Eval: eval_path: ./lambada_test.jsonl cloze_eval: True overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: ppfleetx/configs/nlp/gpt/eval_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTEvalModule Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] Offline_Eval: eval_path: ./wikitext-103/wiki.valid.tokens cloze_eval: False overlapping_eval: 32 batch_size: 8 max_seq_len: 1024 logging_freq: 10 ================================================ FILE: ppfleetx/configs/nlp/gpt/export_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml ================================================ _base_: ./finetune_gpt_base.yaml Global: global_batch_size: local_batch_size: 32 micro_batch_size: 32 Engine: run_mode: epoch num_train_epochs: 3 accumulate_steps: logging_freq: 10 eval_freq: 1 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "reduce_mean"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "GPTFinetuneModule" name: "GPT" num_classes: 2 pretrained: './ckpt/PaddleFleetX_GPT_345M_220826/model' fuse_attn_qkv: True fused_linear: False vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: loss: train: name: 'CrossEntropyLoss' eval: name: 'CrossEntropyLoss' metric: eval: name: 'Accuracy' Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False 
broadcast_overlap: False Optimizer: name: FusedAdamW weight_decay: 0.0 beta1: 0.9 beta2: 0.999 epsilon: 1e-6 multi_precision: True lr: name: LinearDecayWithWarmup warmup: 0.1 learning_rate: 2e-5 tensor_fusion: False Data: Train: dataset: name: SST2 root: ./dataset/SST-2/ split: 'train' max_length: 128 sampler: name: DistributedBatchSampler batch_size: 32 shuffle: True drop_last: True loader: num_workers: 4 return_list: False Eval: dataset: name: SST2 root: ./dataset/SST-2/ split: 'dev' max_length: 128 sampler: name: DistributedBatchSampler batch_size: 32 shuffle: False drop_last: False loader: num_workers: 4 return_list: False ================================================ FILE: ppfleetx/configs/nlp/gpt/finetune_gpt_base.yaml ================================================ Global: device: gpu seed: 42 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: run_mode: epoch max_steps: -1 eval_freq: 1 eval_iters: -1 test_iters: -1 save_load: save_steps: -1 save_epoch: 1 output_dir: ./output ckpt_dir: Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Model: use_flash_attn: False ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_345M_dp8.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_345M_mp1.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Engine: mix_precision: level: Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" Model: module: GPTGenerationModuleAuto vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False fuse_attn_qkv: True Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_gpt_6.7B_single_mp1.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Engine: mix_precision: level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "where"] custom_white_list: ["lookup_table", "lookup_table_v2"] use_fp16_guard: False Generation: top_k: 0 top_p: 0.9 use_topp_sampling: True inference: True temperature: 1.0 min_dec_len: 8 max_dec_len: 8 num_return_sequences: 1 decode_strategy: "sampling" Model: module: GPTGenerationModuleAuto vocab_size: 51200 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: 
16384 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False fuse_attn_qkv: True Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_pruned_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Compress: Prune: enable: True criterion: l1_norm ratio: 0.125 Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_345M_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/generation_qat_gpt_6.7B_single_card.yaml ================================================ _base_: ./pretrain_gpt_6.7B_single_card.yaml Model: module: GPTGenerationModule Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" use_topp_sampling: True inference: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/gpt/inference_gpt_345M_dp8.yaml ================================================ _base_: ./generation_gpt_345M_dp8.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Data: Test: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ================================================ FILE: ppfleetx/configs/nlp/gpt/inference_gpt_345M_single_card.yaml ================================================ _base_: ./generation_gpt_345M_single_card.yaml Inference: model_dir: ./output mp_degree: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Data: Test: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml ================================================ _base_: 
./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 8 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_13B_dp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: seed: 1234 global_batch_size: 480 local_batch_size: micro_batch_size: 4 Engine: max_steps: 200000 eval_freq: 1000 eval_iters: 10 save_load: save_steps: 500 Model: vocab_size: 50432 hidden_size: 5120 num_layers: 40 num_attention_heads: 40 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 4096 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: 'full' no_recompute_layers: Data: Train: dataset: max_seq_len: 4096 Eval: dataset: max_seq_len: 4096 Distributed: dp_degree: mp_degree: 2 pp_degree: 8 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Optimizer: lr: name: CosineAnnealingWithWarmupDecay decay_steps: 160000 warmup_rate: 0.001 max_lr: 1.0e-4 min_lr: 1.0e-5 ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 1536 micro_batch_size: 1 Model: vocab_size: 51200 hidden_size: 12288 num_layers: 96 num_attention_heads: 96 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: 'core_attn' no_recompute_layers: virtual_pp_degree: 1 sequence_parallel: True fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 16 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 
max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Engine: logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: 16384 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: module: "GPTModule" name: "GPT" vocab_size_divisible_unit: 128 fused_linear: False fuse_attn_qkv: True scale_qk_by_layer_num: True sequence_parallel: False use_flash_attn: False fused_softmax_with_triangular: True Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Eval: dataset: name: GPTDataset input_dir: ./data/ split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 use_increments: True grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False 
Distributed: fuse_sequence_parallel_allreduce: False ================================================ FILE: ppfleetx/configs/nlp/gpt/pretrain_gpt_cn_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: name: "GPT-cn" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/gpt/prune_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Engine: save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: 4096 hidden_dropout_prob: 0.0 attention_probs_dropout_prob: 0.0 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: no_recompute_layers: Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False comm_overlap: False Optimizer: weight_decay: 0.0 lr: decay_steps: 90000 warmup_rate: 0.00 max_lr: 2.5e-5 min_lr: 5.0e-6 Compress: pretrained: Prune: enable: True criterion: l1_norm ratio: 0.125 ================================================ FILE: ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 1 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 8 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: ppfleetx/configs/nlp/gpt/qat_gpt_345M_single_card.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: 8 local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: False recompute_granularity: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: 
sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True freeze_embedding: True skip_tensor_map: block_3: ['linear2'] block_5: ['linear1'] block_6: ['linear2'] block_7: ['linear2'] block_10: ['linear2'] block_20: ['linear2'] block_21: ['linear2'] ================================================ FILE: ppfleetx/configs/nlp/gpt/qat_gpt_6.7B_sharding16.yaml ================================================ _base_: ./pretrain_gpt_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Engine: logging_freq: 10 Model: vocab_size: 50304 hidden_size: 4096 num_layers: 32 num_attention_heads: 32 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 16 sharding_stage: 2 sharding_offload: False reduce_overlap: True broadcast_overlap: True Optimizer: tensor_fusion: True Compress: pretrained: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear'] onnx_format: True ================================================ FILE: ppfleetx/configs/nlp/moe/pretrain_moe_1.3B_dp8.yaml ================================================ _base_: ./pretrain_moe_base.yaml Global: global_batch_size: local_batch_size: 8 micro_batch_size: 8 Model: vocab_size: 50304 hidden_size: 2048 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: Distributed: dp_degree: 8 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ================================================ FILE: ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml ================================================ Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: balance_loss_weight: 1.0 Model: module: "MoEModule" name: "MoE" fused_linear: False fuse_attn_qkv: True sequence_parallel: False moe_configs: expert_mode: True gate: gshard top_k: 2 num_experts: 2 Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Eval: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 
sampler: name: GPTBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn Optimizer: name: FusedAdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradForMOEByGlobalNorm" clip_norm: 1.0 tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Distributed: dp_degree: 1 mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 hcg: HybridCommGroupForMoE ================================================ FILE: ppfleetx/configs/vis/base.yaml ================================================ Global: device: gpu seed: 2021 global_batch_size: local_batch_size: 1 micro_batch_size: 1 flags: FLAGS_enable_cublas_tensor_op_math: True FLAGS_gemm_use_half_precision_compute_type: False Engine: run_mode: epoch max_steps: -1 eval_freq: 1 eval_iters: -1 test_iters: -1 save_load: save_steps: -1 save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Model: use_recompute: False Fused: tensor_fusion: False Profiler: enable: False scheduler: [1, 5] profiler_log: profiler_log detailed: False Inference: model_dir: ./output mp_degree: 1 ================================================ FILE: ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2022 Engine: run_mode: 'epoch' num_train_epochs: 100 eval_freq: 1 eval_iters: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "MOCOClsModule" model: base_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier #pretrained: ./pretrained/mocov1/model pretrained: ./pretrained/mocov2/model base_classifier: name: "MoCoClassifier" with_pool: True num_features: 2048 num_classes: 1000 loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 lr: name: MultiStepDecay run_mode: epoch learning_rate: 30.0 gamma: 0.1 milestones: [60, 80] Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total bachsize 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: resize_short: 256 interpolation: bilinear backend: pil - CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 
drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2022 Engine: run_mode: 'epoch' num_train_epochs: 200 eval_freq: -1 eval_iters: 0 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "MOCOModule" model: base_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier base_classifier: name: "MoCoClassifier" with_pool: True num_features: 2048 num_classes: 128 momentum_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier momentum_classifier: name: "MoCoClassifier" with_pool: True num_features: 2048 num_classes: 128 loss: train: name: 'CELoss' Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0001 lr: name: MultiStepDecay run_mode: epoch learning_rate: 0.03 gamma: 0.1 milestones: [120, 160] Data: Train: dataset: name: ContrativeLearningDataset root: ./dataset/ILSVRC2012/train transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 scale: [0.2, 1.0] interpolation: bicubic backend: pil - RandomGrayscale: p: 0.2 - ColorJitter: brightness: 0.4 contrast: 0.4 saturation: 0.4 hue: 0.4 - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2022 Engine: run_mode: 'epoch' num_train_epochs: 200 eval_freq: -1 eval_iters: 0 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "MOCOModule" model: T: 0.2 base_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier base_projector: name: "MoCoV2Projector" in_dim: 2048 out_dim: 2048 with_pool: True base_classifier: name: "MoCoClassifier" with_pool: False num_features: 2048 num_classes: 128 momentum_encoder: name: "resnet50" with_pool: False num_classes: 0 # remove last classifier momentum_projector: name: "MoCoV2Projector" in_dim: 2048 out_dim: 2048 with_pool: True momentum_classifier: name: "MoCoClassifier" with_pool: False num_features: 2048 num_classes: 128 loss: train: name: 'CELoss' Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0001 lr: name: CosineDecay run_mode: epoch update_unit: epoch learning_rate: 0.03 Data: Train: dataset: name: ContrativeLearningDataset root: ./dataset/ILSVRC2012/train transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 scale: [0.2, 1.0] interpolation: bicubic backend: pil - ColorJitter: brightness: 0.4 contrast: 0.4 saturation: 0.4 hue: 0.1 p: 0.8 - RandomGrayscale: p: 0.2 - GaussianBlur: sigma: [.1, 2.] 
p: 0.5 - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml ================================================ Global: device: gpu seed: 2021 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False Engine: run_mode: 'epoch' num_train_epochs: 300 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: ./ckpt Model: use_recompute: False module: "GeneralClsModule" model: name: "ViT_base_patch16_224" class_num: 1000 drop_rate: 0.1 loss: train: name: 'ViTCELoss' epsilon: 0.0001 eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Inference: model_dir: ./output mp_degree: 1 TensorRT: max_batch_size: 1 workspace_size: 1<<30 min_subgraph_size: 3 precision: fp16 use_static: False use_calib_mode: False collect_shape: False shape_range_info_filename: ./shape.pbtxt ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 300 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_224" class_num: 1000 drop_rate: 0.1 loss: train: name: 'ViTCELoss' epsilon: 0.0001 eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 224 scale: [0.05, 1.0] interpolation: bicubic backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: resize_short: 256 interpolation: bicubic backend: pil - 
CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.004 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 0.35 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 103 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_384" class_num: 10 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.004 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 0.35 Data: Train: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: train transform_ops: - RandCropImage: size: 384 scale: 
[0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: test transform_ops: - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True Compress: Quantization: enable: True weight_quantize_type: 'abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 onnx_format: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_base_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-384 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.004 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 0.35 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True Compress: Quantization: enable: True weight_quantize_type: 'channel_wise_abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 onnx_format: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 
accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_large_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet21k-ViT-L_16 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.03 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 8 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_large_patch16_384" class_num: 1000 drop_rate: 0.1 pretrained: prefix_path: ./pretrained/vit/imagenet21k-ViT-L_16 finetune: True loss: train: name: 'CELoss' eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: Momentum weight_decay: 0.0001 momentum: 0.9 lr: name: ViTLRScheduler learning_rate: 0.03 decay_type: cosine warmup_steps: 500 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ class_num: 1000 cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True channel_first: False - RandCropImage: size: 384 scale: [0.05, 1.0] interpolation: bilinear backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 32 # total batchsize 512 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: GeneralClsDataset image_root: ./dataset/ILSVRC2012/ cls_label_path: ./dataset/ILSVRC2012/val_list.txt 
transform_ops: - DecodeImage: to_rgb: True channel_first: False - ResizeImage: size: 384 interpolation: bilinear backend: pil - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 64 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True Compress: Quantization: enable: True weight_quantize_type: 'channel_wise_abs_max' activation_quantize_type: 'moving_average_abs_max' activation_preprocess_type: 'PACT' weight_bits: 8 activation_bits: 8 onnx_format: True ================================================ FILE: ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml ================================================ _base_: ../base.yaml Global: device: gpu seed: 2021 Engine: run_mode: 'epoch' num_train_epochs: 1 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModule" model: name: "ViT_tiny_patch16_224" class_num: 10 drop_rate: 0.1 loss: train: name: 'ViTCELoss' epsilon: 0.0001 eval: name: 'CELoss' metric: train: name: 'TopkAcc' topk: [1, 5] eval: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: train transform_ops: - RandCropImage: size: 224 scale: [0.05, 1.0] interpolation: bicubic backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: True shuffle: True loader: num_workers: 8 use_shared_memory: True Eval: dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: test transform_ops: - ResizeImage: resize_short: 256 interpolation: bicubic backend: pil - CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: sampler: name: DistributedBatchSampler batch_size: 256 drop_last: False shuffle: False loader: num_workers: 8 use_shared_memory: True ================================================ FILE: ppfleetx/configs/vis/vit/auto/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml ================================================ _base_: ./base.yaml Global: device: gpu seed: 2021 local_batch_size: 256 micro_batch_size: 256 Engine: num_train_epochs: 1 eval_freq: 1 accumulate_steps: 1 logging_freq: 10 mix_precision: level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] save_load: save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: Model: module: "GeneralClsModuleAuto" model: name: "ViT_tiny_patch16_224" class_num: 10 drop_rate: 0.1 loss: name: 'ViTCELoss' metric: name: 'TopkAcc' topk: [1, 5] Optimizer: name: AdamW weight_decay: 0.3 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: ViTLRScheduler learning_rate: 0.003 decay_type: cosine warmup_steps: 10000 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 Data: Train: sample_split: 1 dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: train transform_ops: - RandCropImage: size: 224 scale: [0.05, 
1.0] interpolation: bicubic backend: pil - RandFlipImage: flip_code: 1 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: Eval: sample_split: 1 dataset: name: CIFAR10 root: ./dataset/cifar-10-batches-py/ mode: test transform_ops: - ResizeImage: resize_short: 256 interpolation: bicubic backend: pil - CenterCropImage: size: 224 - NormalizeImage: scale: 1.0/255.0 mean: [0.5, 0.5, 0.5] std: [0.5, 0.5, 0.5] order: '' - ToCHWImage: ================================================ FILE: ppfleetx/configs/vis/vit/auto/base.yaml ================================================ Global: device: gpu seed: 2021 global_batch_size: local_batch_size: 1 micro_batch_size: 1 Engine: run_mode: epoch max_steps: -1 eval_freq: 1 eval_iters: -1 test_iters: -1 save_load: save_steps: -1 save_epoch: 1 output_dir: ./output ckpt_dir: Distributed: dp_degree: mp_degree: 1 pp_degree: 1 sharding: sharding_degree: 1 sharding_stage: 1 Model: use_recompute: False ================================================ FILE: ppfleetx/core/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .engine import * from .module import * ================================================ FILE: ppfleetx/core/engine/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .basic_engine import BasicEngine from .inference_engine import InferenceEngine, TensorRTConfig from .eager_engine import EagerEngine from .auto_engine import AutoEngine ================================================ FILE: ppfleetx/core/engine/auto_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
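# ---------------------------------------------------------------------------
# Illustrative usage sketch (comments only). It assumes `configs` has been
# parsed from one of the yaml files above and that `module`, `train_dataset`
# and `valid_dataset` have been built elsewhere (e.g. via GeneralClsModuleAuto
# and build_auto_dataset); the calls mirror the AutoEngine API defined below:
#
#     engine = AutoEngine(configs, module=module, mode='train')
#     engine.load()       # optional, only if Engine.save_load.ckpt_dir is set
#     engine.fit(train_dataset=train_dataset, valid_dataset=valid_dataset)
#     engine.save()       # writes <output_dir>/auto
# ---------------------------------------------------------------------------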
import os import time import sys import logging import numpy as np import paddle import paddle.nn as nn import paddle.distributed as dist import paddle.fluid.core as core from paddle.distributed.fleet import auto from paddle.optimizer.lr import LRScheduler from ppfleetx.utils.log import logger from ppfleetx.core.engine import BasicEngine from ppfleetx.core.module import BasicModule from ppfleetx.utils.version import version_check from ppfleetx.data import utils from ppfleetx.optims import build_lr_scheduler, build_optimizer logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class AutoEngine(BasicEngine): def __init__(self, configs, module=None, mode='train'): super().__init__() version_check() model = None loss_fn = None if module and not isinstance(module, BasicModule): raise TypeError( "'module' must be sub classes of `BasicModule`, but got: {model.__class__.__name__}." ) if module: if module.model and not isinstance( module.model, nn.Layer) and not callable(module.model): raise TypeError( "'model' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.model.__class__.__name__}." ) model = module.model if mode == 'train': if module.loss_fn and not isinstance( module.loss_fn, nn.Layer) and not callable(module.loss_fn): raise TypeError( "'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}." ) else: module.loss_fn = None module.model.eval() loss_fn = module.loss_fn self._module = module # lr_scheduler and optimizer lr = build_lr_scheduler( configs.Optimizer.lr) if mode == "train" else None optimizer = build_optimizer(configs.Optimizer, model, lr) if mode == "train" else None # engine configs self._configs = configs['Engine'] self._max_steps = self._configs['max_steps'] self._verbose = self._configs["verbose"] self._eval_freq = self._configs['eval_freq'] self._eval_iters = self._configs['eval_iters'] self._test_iters = self._configs['test_iters'] self._logging_freq = self._configs['logging_freq'] self._num_train_epochs = self._configs['num_train_epochs'] self._strategy = self._configs['strategy'] # save & load self._save_steps = self._configs['save_load']['save_steps'] self._save_epoch = self._configs['save_load']['save_epoch'] self._output_dir = self._configs['save_load']['output_dir'] self._ckpt_dir = self._configs['save_load']['ckpt_dir'] # engine fit inputs self.batch_size = configs['Global']['global_batch_size'] # init engine self._auto_engine = auto.Engine( model, loss_fn, optimizer, strategy=self._strategy) def fit(self, epoch=1, train_dataset=None, valid_dataset=None): train_sample_split = train_dataset.sample_split if train_dataset else None valid_sample_split = valid_dataset.sample_split if valid_dataset else None self._auto_engine.fit(train_data=train_dataset, valid_data=valid_dataset, train_sample_split=train_sample_split, valid_sample_split=valid_sample_split, epochs=self._num_train_epochs, batch_size=self.batch_size, steps_per_epoch=self._max_steps, valid_steps=self._eval_iters, valid_freq=self._eval_freq, collate_fn=train_dataset.collate_fn, log_freq=self._logging_freq, save_dir=self._output_dir, save_freq=self._save_steps, verbose=self._verbose) def evaluate(self, valid_dataset=None): self._auto_engine.evaluate( valid_data=valid_dataset, valid_sample_split=valid_dataset.sample_split, batch_size=self.batch_size, steps=self._max_steps, collate_fn=valid_dataset.collate_fn) def predict(self, test_dataset=None): self._auto_engine.predict( 
test_data=test_dataset, test_sample_split=test_dataset.sample_split, batch_size=self.batch_size, steps=self._max_steps, collate_fn=test_dataset.collate_fn) def export(self): self._auto_engine.prepare(self._module.input_spec(), mode="predict") self.save(training=False) def tune(self, tune_dataset=None): self._auto_engine._tune( tune_dataset, tune_sample_split=tune_dataset.sample_split, batch_size=self.batch_size) def save(self, training=True): if self._output_dir and isinstance(self._output_dir, str): path = os.path.join(self._output_dir, "auto") self._auto_engine.save(path, training=training) else: raise TypeError("`save` requires a valid value of `output_dir`.") def load(self): if self._ckpt_dir and isinstance(self._ckpt_dir, str): self._auto_engine.load(self._ckpt_dir) else: logger.warning("`load` requires a valid value of `ckpt_dir`.") def export_from_prog(self): paddle.enable_static() if not (self._ckpt_dir and isinstance(self._ckpt_dir, str)): raise ValueError("invalid ckpt_dir.") exe = paddle.static.Executor() [inference_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model( path_prefix=self._ckpt_dir, executor=exe) feed_targets = [ inference_program.global_block().var(name) for name in feed_target_names ] self._auto_engine.prepare( inputs=feed_targets, main_program=inference_program, startup_program=paddle.static.Program(), mode="predict") model_dict = self._auto_engine.main_program.state_dict() for param in list( filter(lambda var: var.persistable, self._auto_engine.main_program.list_vars())): if param.type in [ core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST ]: continue if param.dtype != model_dict[param.name]._dtype(): model_dict[param.name] = model_dict[param.name]._as_type( param.dtype) self._auto_engine.main_program.set_state_dict(model_dict) path = os.path.join(self._output_dir, "auto_dist0") paddle.static.save_inference_model( path, feed_targets, fetch_targets, exe, program=self._auto_engine.main_program, ) paddle.disable_static() ================================================ FILE: ppfleetx/core/engine/basic_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. class BasicEngine: """ """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def fit(self, *args, **kwargs): raise NotImplementedError def evaluate(self, *args, **kwargs): raise NotImplementedError def predict(self, *args, **kwargs): raise NotImplementedError def save(self, *args, **kwargs): raise NotImplementedError def load(self, *args, **kwargs): raise NotImplementedError def inference(self, *args, **kwargs): raise NotImplementedError ================================================ FILE: ppfleetx/core/engine/eager_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time import sys import logging from tokenize import group import paddle import paddle.nn as nn import paddle.distributed as dist import paddle.distributed.fleet as fleet from paddle.optimizer.lr import LRScheduler from paddle.distributed.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.profiler import SummaryView from paddle.distributed.fleet.meta_parallel import TensorParallel from paddle.distributed.sharding import group_sharded_parallel import paddleslim from ppfleetx.distributed.apis import env, amp from ppfleetx.optims import build_lr_scheduler, build_optimizer from ppfleetx.utils.log import logger, get_timestamp, convert_timestamp_to_data from ppfleetx.core.engine import BasicEngine, InferenceEngine, TensorRTConfig from ppfleetx.core.module import BasicModule from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters from ppfleetx.utils.version import version_check from ppfleetx.utils.export import export_inference_model from paddle.incubate.distributed.utils.io import save_for_auto_inference from ppfleetx.utils.device import synchronize as device_synchronize from ppfleetx.utils.compression_helper import prune_model, quant_model class EagerEngine(BasicEngine): """ The common engine for all models that support single-card and distributed training, validation and test. Only used in eager dygraph mode. """ def __init__(self, configs, module, optimizer=None, lr=None, mode='train'): """ Initialize an engine depending on the user-defined module and configs. Args: module(BasicModule): user-defined module. After assigning computations and configurations of model/optimizers/lr Schedulers, engine can support the whole loop of training/validation/test. configs(dict): the configurations that engine needs for training/validation/test loop. Such as mix precision strategy, save&load and the infos of steps/epoches. Return: An instance of `EagerEngine`. Examples:: class TestModule(BasicModule): def __init__(self): super().__init__() self.model = paddle.nn.Linear(28 * 28, 10) self.loss_fn = paddle.nn.MSELoss() def forward(self, x): return paddle.relu(self.model(x.reshape(-1))) def training_step(self, batch): x, y = batch loss = self.loss_fn(self(x), y) return loss def configure_optimizers(self): return paddle.optimizer.Adam( parameters=self.model.parameters(), learning_rate=0.02) module = TestModule() engine = EagerEngine(module, configs) """ super().__init__() version_check() self.mode = mode if not isinstance(module, BasicModule): raise TypeError( "'module' must be sub classes of `BasicModule`, but got: {model.__class__.__name__}." ) self._module = module if module.model and not isinstance( module.model, nn.Layer) and not callable(module.model): raise TypeError( "'model' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.model.__class__.__name__}." 
) # if mode == 'train': # if module.loss_fn and not isinstance( # module.loss_fn, nn.Layer) and not callable(module.loss_fn): # raise TypeError( # "'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}." # ) # global configs self._global_batch_size = configs['Global']['global_batch_size'] # engine configs self._configs = configs['Engine'] self._run_mode = self._configs.get('run_mode', 'step') assert self._run_mode in ['epoch', 'step' ], 'run_mode must be epoch or step' self._max_steps = self._configs['max_steps'] self._eval_freq = self._configs['eval_freq'] self._eval_iters = self._configs['eval_iters'] self._test_iters = self._configs['test_iters'] self._logging_freq = self._configs['logging_freq'] self._num_train_epochs = self._configs['num_train_epochs'] self._accumulate_steps = self._configs['accumulate_steps'] amp_config = self._configs['mix_precision'] self._amp_enable = amp_config['enable'] if mode == 'export' and self._amp_enable: logger.info("NOTE: disable mix_precision in export mode") self._amp_enable = False self._amp_dtype = amp_config.get('dtype', 'float16') self._amp_level = amp_config.get('level', 'O2') self._use_main_grad = amp_config.get('use_main_grad', False) self._scale_loss = amp_config['scale_loss'] self._custom_black_list = amp_config['custom_black_list'] self._custom_white_list = amp_config['custom_white_list'] self._save_steps = self._configs['save_load']['save_steps'] self._save_epoch = self._configs['save_load']['save_epoch'] self._output_dir = self._configs['save_load']['output_dir'] self._ckpt_dir = self._configs['save_load']['ckpt_dir'] self._compress_configs = None self.prune_configs = None self.quant_configs = None self._quant_mode = False if 'Compress' in configs: self.mode = 'compress' self._compress_configs = configs['Compress'] if "Prune" in self._compress_configs: self.prune_configs = self._compress_configs["Prune"] if "Quantization" in self._compress_configs: self.quant_configs = self._compress_configs["Quantization"] self._quant_mode = True self.compress_model() # TODO(haohongxiang): Remove there extra configs after reconstruct of Fleet API self._dist_configs = configs['Distributed'] self._dp_degree = self._dist_configs['dp_degree'] self._mp_degree = self._dist_configs['mp_degree'] self._pp_degree = self._dist_configs['pp_degree'] sharding_config = self._dist_configs['sharding'] self._sharding_stage = sharding_config['sharding_stage'] self._sharding_degree = sharding_config['sharding_degree'] self._sharding_offload = sharding_config['sharding_offload'] self._reduce_overlap = sharding_config['reduce_overlap'] self._broadcast_overlap = sharding_config['broadcast_overlap'] self._use_recompute = configs['Model']['use_recompute'] if self._amp_enable: if mode == 'train' and self._amp_dtype == "float16": self._scaler = paddle.amp.GradScaler( init_loss_scaling=self._scale_loss) else: # bfloat16 self._scaler = paddle.amp.GradScaler( init_loss_scaling=1, use_dynamic_loss_scaling=False) # Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
if self._amp_level == "O2": self._module.model = paddle.amp.decorate( models=self._module.model, dtype=self._amp_dtype, level=self._amp_level) else: self._scaler = None if mode == 'train': self._use_increments = configs.Optimizer.lr.pop('use_increments', False) self._lr_scheduler_mode = configs.Optimizer.lr.pop('run_mode', 'step') assert self._lr_scheduler_mode in [ 'epoch', 'step' ], 'lr.run_mode must be epoch or step' self._lr_scheduler = build_lr_scheduler( configs.Optimizer.lr) if mode == 'train' else None self._optimizer = build_optimizer( configs.Optimizer, self._module.model, self._lr_scheduler) if mode == 'train' else None if self._amp_enable and self._amp_dtype in [ 'float16', 'bfloat16' ] and self._amp_level == 'O2' and self._use_main_grad: self._module.model = amp.MixPrecisionLayer( self._module.model, dtype=self._amp_dtype) self._optimizer = amp.MixPrecisionOptimizer(self._optimizer) self._scaler = amp.MixPrecisionScaler(self._scaler) # distributed configs self._distributed = (dist.get_world_size() > 1) if self._distributed: self._hcg = env.get_hcg() self._dp_group = self._hcg.get_data_parallel_group() self._sharding_group = self._hcg.get_sharding_parallel_group() self._dp_rank = self._hcg.get_data_parallel_rank() self._mp_rank = self._hcg.get_model_parallel_rank() self._pp_rank = self._hcg.get_stage_id() self._sharding_rank = self._hcg.get_sharding_parallel_rank() self._wrap_with_fleet() else: self._dp_rank = 0 # using for save/load self._load_recovery = {'step': 0, 'epoch': 0, 'rng_state': -1} if 'Inference' in configs: self._inference_configs = configs['Inference'] self._inference_engine = None self.profiler = None if 'Profiler' in configs and configs.get('Profiler', {}).get('enable', False): self.profiler_config = configs['Profiler'] scheduler = self.profiler_config.get('scheduler', None) profiler_log = self.profiler_config.get('profiler_log', './profiler_log') record_shapes = self.profiler_config.get('record_shapes', True) profile_memory = self.profiler_config.get('profile_memory', True) self.profiler = paddle.profiler.Profiler( targets=[ paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU ], scheduler=scheduler, on_trace_ready=paddle.profiler.export_chrome_tracing( profiler_log), record_shapes=record_shapes, profile_memory=profile_memory) self.profiler.start() logger.warning( "Profiler is enabled, do not enable it in production.") def _wrap_with_fleet(self): if self._sharding_stage in [2, 3]: assert self._pp_degree == 1, "sharding stage2/3 will support pipeline parallel later" self._wrap_sharding_2_3() else: self._wrap_3D_parallel() def _wrap_sharding_2_3(self): if self._dp_degree > 1 and self._sharding_stage == 3: sync_params_buffers( self._module.model, comm_group=self._dp_group, src_rank=self._dp_group.ranks[0]) if self._mp_degree > 1: assert self._sharding_stage == 2, "only support mp + sharding stage2 hybrid parallel now." 
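# For mp_degree > 1 the layers are first wrapped with TensorParallel, then
# group_sharded_parallel shards optimizer states and gradients ("os_g",
# stage 2) or parameters, gradients and optimizer states ("p_g_os", stage 3)
# across the sharding group; reduce/broadcast overlap is enabled afterwards
# when configured.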
self._module.model = TensorParallel( self._module.model, self._hcg, strategy=None) level = "p_g_os" if self._sharding_stage == 3 else "os_g" origin_model = self._module.model self._module.model, self._optimizer, self._scaler = group_sharded_parallel( model=self._module.model, optimizer=self._optimizer, level=level, scaler=self._scaler, group=self._sharding_group, offload=self._sharding_offload, dp_group=self._dp_group if self._dp_group.nranks > 1 else None) if self._reduce_overlap: self._module.model._set_reduce_overlap(self._reduce_overlap) if self._broadcast_overlap: self._optimizer._set_broadcast_overlap( self._broadcast_overlap, layers=origin_model, num_groups=2) def _wrap_3D_parallel(self): if isinstance(self._module.model, amp.MixPrecisionLayer): if dist.get_world_size() == self._dp_degree: sync_params_buffers( self._module.model, comm_group=self._dp_group, src_rank=self._dp_group.ranks[0]) elif self._pp_degree > 1: self._module.model = fleet.distributed_model( self._module.model._layers) else: self._module.model = fleet.distributed_model(self._module.model) self._optimizer = fleet.distributed_optimizer(self._optimizer) self._scaler = fleet.distributed_scaler( self._scaler) if self._scaler is not None else self._scaler def _train_one_epoch(self, epoch_index, train_data_loader=None, valid_data_loader=None): self._module.model.train() # time count train_losses = [] train_step_start = get_timestamp() skip_first = True # Note(GuoxiaWang): Do not use len(train_data_loader()), # it will cause a memory leak. total_train_batch = self._max_steps if self._run_mode == 'step' else len( train_data_loader) total_train_step = self._max_steps if self._run_mode == 'step' else total_train_batch * self._num_train_epochs total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 valid_data_loader = valid_data_loader( ) if valid_data_loader is not None else None eval_finished_step = 0 for step, batch in enumerate(train_data_loader()): if epoch_index == self._load_recovery['epoch']: if step < self._load_recovery['step']: continue loss = self._fit_impl(batch) train_losses.append(loss) if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step': if self._scaler is None or self._scaler._found_inf == 0: self._lr_scheduler.step(epoch=self._global_batch_size if self._use_increments else None) if (step + 1) % self._logging_freq == 0: train_step_cost = get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] log_dict = { 'epoch': epoch_index, 'total_epoch': self._num_train_epochs, 'batch': step, 'total_batch': total_train_batch, 'total_step': total_train_step, 'train_cost': train_step_cost if step == 0 else train_step_cost / self._logging_freq, 'loss': sum(numpy_losses) / len(numpy_losses), 'lr': self._optimizer.get_lr(), 'found_inf': self._scaler._found_inf if self._scaler is not None else 0, } if self._amp_enable: log_dict['loss_scale'] = self._scaler._scale.numpy()[0] self._module.training_step_end(log_dict) train_step_start = get_timestamp() train_losses = [] self._optimizer.clear_grad() if self._run_mode == 'step' and not skip_first: if self._eval_freq > 0 and step % self._eval_freq == 0: eval_losses = [] eval_step_start = get_timestamp() for eval_step, batch in enumerate(valid_data_loader): eval_finished_step += 1 loss = self._evaluate_impl(batch) eval_losses.append(loss) if eval_step >= self._eval_iters - 1: break eval_step_cost = get_timestamp() - eval_step_start eval_loss = sum(eval_losses) / len(eval_losses) log_dict = { 'loss': 
float(eval_loss), 'epoch': epoch_index, 'batch': eval_finished_step, 'total_batch': total_eval_batch, 'eval_cost': eval_step_cost / self._logging_freq, } self._module.validation_step_end(log_dict) if self._save_steps > 0 and step % self._save_steps == 0: device_synchronize() self.save(epoch=epoch_index, step=step) else: skip_first = False if self._run_mode == 'step' and step >= self._max_steps: return if self.profiler: self.profiler.step() def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None): """ Run the full process of training/validation/save loop. Args: epoch(int): the epoch index. train_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying training samples. valid_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying validation samples. """ self._module.model.train() train_start = get_timestamp() start_epoch = self._load_recovery['epoch'] if self._load_recovery['rng_state'] != -1: paddle.set_cuda_rng_state(self._load_recovery['rng_state']) for epoch_index in range(start_epoch, epoch): train_epoch_start = get_timestamp() self._train_one_epoch(epoch_index, train_data_loader, valid_data_loader) train_epoch_cost = get_timestamp() - train_epoch_start log_dict = { 'epoch': epoch_index, 'train_cost': train_epoch_cost, } self._module.training_epoch_end(log_dict) if self._lr_scheduler is not None and self._lr_scheduler_mode == 'epoch': self._lr_scheduler.step() if self._run_mode == 'epoch' and self._eval_freq > 0 and \ epoch_index % self._eval_freq == 0: eval_epoch_start = get_timestamp() self._evaluate_one_epoch(epoch_index, valid_data_loader) eval_epoch_cost = get_timestamp() - eval_epoch_start log_dict = { 'epoch': epoch_index, 'eval_cost': eval_epoch_cost, } self._module.validation_epoch_end(log_dict) if self._save_epoch > 0 and self._run_mode == 'epoch' and epoch_index % self._save_epoch == 0: self.save(epoch=epoch_index, step=len(train_data_loader)) logger.info( "The training process is complete and total cost of time for training is : {}". 
format(convert_timestamp_to_data(get_timestamp() - train_start))) if self.profiler: self._profiler_done() def _fit_impl(self, batch): self._module.model.train() batch = self._module.pretreating_batch(batch) if self._pp_degree == 1: if self._use_recompute and isinstance(self._module.model, paddle.DataParallel): with self._module.model.no_sync(): loss = self._model_forward_backward(batch) if not hasattr(self._optimizer, "all_fused_tensors" ) or self._optimizer.all_fused_tensors is None: try: fused_allreduce_gradients( list(self._module.model.parameters()), None) except: m = self._module.model.state_dict() fused_allreduce_gradients( list(self._module.model.parameters()), None) else: all_reduce_parameters(self._optimizer.all_fused_tensors, self._dp_group) elif isinstance(self._module.model, amp.MixPrecisionLayer) \ and self._distributed and dist.get_world_size() == self._dp_degree: loss = self._model_forward_backward(batch) fused_allreduce_gradients( list(self._module.model.parameters()), None) else: loss = self._model_forward_backward(batch) else: with paddle.amp.auto_cast( enable=self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): batch = self._module.model._prepare_training( batch, self._optimizer, self._lr_scheduler) loss = self._module.model.forward_backward_pipeline( batch, self._scaler) self._optim_update_params() return loss def _model_forward_backward(self, batch): if self._accumulate_steps == 1 or self._pp_degree > 1: batches = [batch] else: split_batches = [ paddle.split(b, self._accumulate_steps) for b in batch ] batches = [] for i in range(len(split_batches[0])): micro_batch = [split_batch[i] for split_batch in split_batches] batches.append(micro_batch) final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): loss = self._module.training_step(micro_batch) if self._amp_enable and self._amp_dtype == "float16": loss_bw = self._scaler.scale(loss) else: loss_bw = loss if self._accumulate_steps > 1: # div the loss for backward loss_bw = loss_bw / self._accumulate_steps self._module.backward(loss_bw) detach_loss = loss.detach() if final_loss is None: final_loss = detach_loss else: final_loss = paddle.add(final_loss, detach_loss) if self._accumulate_steps > 1: # div the loss for print final_loss = final_loss / self._accumulate_steps return final_loss def _optim_update_params(self): if self._sharding_stage in [3] and self._dp_degree > 1: fused_allreduce_gradients(self._module.model.parameters(), self._hcg) for p in self._module.model.parameters(): if hasattr(p, "bw_storage"): assert p.grad is None, "This case shouldn't happen." p.bw_storage.scale_(1.0 / self._dp_group.nranks) dist.all_reduce(p.bw_storage, group=self._dp_group) if self._amp_enable and self._amp_dtype == "float16": self._scaler.step(self._optimizer) self._scaler.update() else: self._optimizer.step() @paddle.no_grad() def evaluate(self, epoch=1, valid_data_loader=None): """ run one evaluation epoch over the validation set. Args: epoch(int): the epoch index. valid_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying validation samples. 
""" self._module.model.eval() for epoch_index in range(epoch): eval_epoch_start = get_timestamp() self._evaluate_one_epoch(epoch_index, valid_data_loader) eval_epoch_cost = get_timestamp() - eval_epoch_start log_dict = { 'epoch': epoch_index, 'eval_cost': eval_epoch_cost, } self._module.validation_epoch_end(log_dict) logger.info("The evaluting process is complete.") del valid_data_loader return @paddle.no_grad() def _evaluate_one_epoch(self, epoch=1, valid_data_loader=None): self._module.model.eval() eval_step_start = get_timestamp() eval_losses = [] total_eval_batch = len(valid_data_loader) valid_data_loader = valid_data_loader( ) if valid_data_loader is not None else None for eval_step, batch in enumerate(valid_data_loader): loss = self._evaluate_impl(batch) eval_losses.append(float(loss)) if eval_step % self._logging_freq == 0: eval_step_cost = get_timestamp() - eval_step_start log_dict = { 'loss': sum(eval_losses) / len(eval_losses), 'epoch': epoch, 'batch': eval_step, 'total_batch': total_eval_batch, 'eval_cost': eval_step_cost if eval_step == 0 else eval_step_cost / self._logging_freq, } self._module.validation_step_end(log_dict) eval_step_start = get_timestamp() eval_losses = [] if self._run_mode == 'step' and eval_step >= self._max_steps: logger.info("[eval] epoch {} : evaluting process is complete.". format(epoch)) return @paddle.no_grad() def _evaluate_impl(self, batch): self._module.model.eval() batch = self._module.pretreating_batch(batch) with paddle.amp.auto_cast( self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): if self._pp_degree == 1: loss = self._module.validation_step(batch) else: loss = self._module.model.eval_batch(batch, compute_loss=True) return loss @paddle.no_grad() def predict(self, epoch=1, test_data_loader=None): """ run one evaluation epoch over the test set. Args: epoch(int): the epoch index. test_data_loader(DataLoader, None): a collection of :class:`paddle.io.DataLoader`, specifying test samples. """ self._module.model.eval() test_start = get_timestamp() test_losses = [] test_data_loader = test_data_loader() for test_step, batch in enumerate(test_data_loader): loss = self._predict_impl(batch) test_losses.append(float(loss)) if test_step % self._logging_freq == 0: test_cost = get_timestamp() - test_start log_dict = { 'loss': sum(test_losses) / len(test_losses), 'epoch': epoch, 'batch': test_step, 'test_cost': test_cost if test_step == 0 else test_cost / self._logging_freq, } self._module.test_step_end(log_dict) test_start = get_timestamp() test_losses = [] if test_step >= self._max_steps: logger.info("The predicting process is complete.") del test_data_loader return @paddle.no_grad() def _predict_impl(self, batch): self._module.model.eval() batch = self._module.pretreating_batch(batch) with paddle.amp.auto_cast( self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): if self._pp_degree == 1: loss = self._module.test_step(batch) else: loss = self._module.model.eval_batch(batch, compute_loss=True) return loss def save(self, epoch=0, step=0): """ save the state dicts of model and optimizer into an checkpoint. 
""" if self._dp_rank != 0: logger.info("DP_Rank %d doesn't save model" % self._dp_rank) return if self._output_dir and isinstance(self._output_dir, str): output_dir = os.path.join(self._output_dir, "epoch_%d_step_%d" % (epoch, step)) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) logger.info("Save model to %s" % output_dir) save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( output_dir, self._mp_rank, self._sharding_rank, self._pp_rank) if self._distributed else output_dir if self._sharding_stage == 3: self._module.model.get_all_parameters(convert2cpu=False) paddle.save(self._module.model.state_dict(), os.path.join(save_dir, "model.pdparams")) paddle.save(self._optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt")) meta_dict = { "epoch": epoch, "step": step, "cuda_rng_state": paddle.get_cuda_rng_state() } paddle.save(meta_dict, os.path.join(save_dir, "meta_state.pdopt")) save_auto_dir = os.path.join(output_dir, "auto_infer") save_for_auto_inference( os.path.join(save_auto_dir, "auto"), self._module.model) else: raise TypeError("`save` requires a valid value of `output_dir`.") def compress_model(self): if self._compress_configs is None: return self._distributed = (dist.get_world_size() > 1) # Load pretrained model before compression if 'pretrained' in self._compress_configs and self._compress_configs[ 'pretrained'] is not None: self._ckpt_dir = self._compress_configs['pretrained'] self.load() # Avoid loading again self._configs['save_load']['ckpt_dir'] = None if self.prune_configs is not None and self.prune_configs.enable: prune_model(self._module.model, self.prune_configs, self._module.input_spec()) #NOTE(minghaoBD): We haven't fully tested Prune+Quantization, so an "else if" is put here for separation. elif self.quant_configs is not None and self.quant_configs.enable: self._module.model, self.quanter = quant_model(self._module.model, self.quant_configs) def load(self): """ load the saved checkpoint file and update the state dicts of model and optimizer. """ if self._ckpt_dir and isinstance(self._ckpt_dir, str): logger.info("Try to load checkpoint from %s " % self._ckpt_dir) if self._quant_mode: load_dir = self._ckpt_dir else: load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( self._ckpt_dir, self._mp_rank, self._sharding_rank, self._pp_rank) if self._distributed else self._ckpt_dir model_path = os.path.join(load_dir, "model.pdparams") opt_path = os.path.join(load_dir, "model_state.pdopt") meta_path = os.path.join(load_dir, "meta_state.pdopt") if os.path.exists(model_path): model_dict = paddle.load(model_path) for name, param in self._module.model.state_dict().items(): assert name in model_dict.keys( ), "No param named `{}` was found in checkpoint file.".format( name) if param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) self._module.model.set_state_dict(model_dict) else: raise ValueError("No optimizer checkpoint file found in %s." % model_path) if self.mode == 'train': if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) self._optimizer.set_state_dict(opt_dict) else: raise ValueError( "No optimizer checkpoint file found in %s." % opt_path) if os.path.exists(meta_path): meta_dict = paddle.load(meta_path) self._load_recovery = { 'step': meta_dict['step'], 'epoch': meta_dict['epoch'], 'rng_state': meta_dict['cuda_rng_state'] } else: raise ValueError("No meta checkpoint file found in %s." 
% meta_path) logger.info("successfully load checkpoints") else: logger.warning("`load` requires a valid value of `ckpt_dir`.") raise TypeError("`load` requires a valid value of `ckpt_dir`.") def export(self): self._module.model.eval() input_spec = self._module.input_spec() save_dir = os.path.join(self._output_dir, "rank_{}".format(self._dp_rank)) if not self._quant_mode: export_inference_model(self._module.model, input_spec, save_dir, 'model') else: logger.info("export quantized model.") export_inference_model( self._module.model, input_spec, save_dir, 'model', export_quant_model=True, quanter=self.quanter) def inference(self, data): if self._inference_engine is None: # parse TensorRT config tensorrt_config = None if 'TensorRT' in self._inference_configs: tensorrt_config = TensorRTConfig( **self._inference_configs['TensorRT']) self._inference_engine = InferenceEngine( self._inference_configs['model_dir'], self._inference_configs['mp_degree'], tensorrt_config) return self._inference_engine.predict(data) def _print_summary(self): views_dict = { SummaryView.DeviceView: 'device', SummaryView.OverView: 'overview', SummaryView.ModelView: 'model', SummaryView.DistributedView: 'dist', SummaryView.KernelView: 'kernel', SummaryView.OperatorView: 'op', SummaryView.MemoryView: 'mem', SummaryView.MemoryManipulationView: 'memcpy', SummaryView.UDFView: 'udf', } default_views = [ SummaryView.OverView, SummaryView.ModelView, SummaryView.KernelView, SummaryView.OperatorView, ] def gen_views(cfg): # print all summary view if detailed=True if self.profiler_config.get('detailed', False): return None views = [] # override default view with user defined value if detailed=False for view in SummaryView: v = self.profiler_config.get('summary', {}).get( views_dict[view], None) if v is True or (v is None and view in default_views): views.append(view) return views or None self.profiler.summary( sorted_by=paddle.profiler.SortedKeys.GPUTotal, views=gen_views(self.profiler_config)) def _profiler_done(self): if not self.profiler: return logger.info("Profiler finished, prepare to print summary...") self.profiler.stop() self._print_summary() profiler_log = self.profiler_config.get('profiler_log', './profiler_log') logger.info( "For more information please install visualdl and run it with following command:" ) logger.info( "-------------------------------------------------------------------------------" ) logger.info(f"visualdl --host 0.0.0.0 --logdir {profiler_log}") logger.info( "-------------------------------------------------------------------------------" ) ================================================ FILE: ppfleetx/core/engine/inference_engine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
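# ---------------------------------------------------------------------------
# Illustrative usage sketch (comments only). The model directory, input dict
# and shape-range file below are assumptions; note that `workspace_size` is
# passed through `eval`, so a string expression such as '1 << 30' is expected:
#
#     trt = TensorRTConfig(precision='fp16', workspace_size='1 << 30',
#                          collect_shape=True,
#                          shape_range_info_filename='./shape_range_info.pbtxt')
#     engine = InferenceEngine('./output', mp_degree=1, tensorrt_config=trt)
#     outputs = engine.predict({'input_ids': ids})  # dict keyed by output names
# ---------------------------------------------------------------------------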
import os import numpy as np from collections.abc import Sequence, Mapping import paddle import paddle.distributed.fleet as fleet # TensorRT precisions TRT_PRECISIONS = { 'fp32': paddle.inference.PrecisionType.Float32, 'fp16': paddle.inference.PrecisionType.Half, 'int8': paddle.inference.PrecisionType.Int8, } class _StaticGuard(object): def __init__(self): pass def __enter__(self): paddle.enable_static() def __exit__(self, exc_type, exc_val, exc_tb): paddle.disable_static() class TensorRTConfig(object): """ TensorRT Inference Configuration Args: max_batch_size (int): The maxmum batch size of input data. Default 1 workspace_size (int): The size of TensorRT workspace in bytes. Default 1<<30 min_subgraph_size (int): The minimum subgraph node size to convert subgraph to TensorRT engine. Default 3 precision (str): The inference precision, can be 'fp32', 'fp16' and 'int8'. Default 'fp16' use_static (bool): Whether to serialize and save TensorRT engine. Default False use_calib_mode (bool): Whether to use TensorRT calibration. Default False collect_shape (bool): Whether to collect dynamic shape. Default False shape_range_info_filename (str): Path to dynamic shape range file. Default None """ def __init__(self, max_batch_size=1, workspace_size=1 << 30, min_subgraph_size=3, precision='fp16', use_static=False, use_calib_mode=False, collect_shape=False, shape_range_info_filename=None): self.max_batch_size = max_batch_size self.workspace_size = eval(workspace_size) self.min_subgraph_size = min_subgraph_size self.precision = precision self.use_static = use_static self.use_calib_mode = use_calib_mode self.shape_range_info_filename = shape_range_info_filename self.collect_shape = collect_shape @property def precision(self): return TRT_PRECISIONS[self._precision] @precision.setter def precision(self, value): print("value", value) assert value.lower() in ['fp32', 'fp16', 'int8'], \ "TensorRT precision can only be 'fp32', 'fp16' or 'int8', " \ "but got {}".format(value.lower()) self._precision = value.lower() @property def collect_shape(self): return self._collect_shape @collect_shape.setter def collect_shape(self, value): if value: assert self.shape_range_info_filename is not None, \ "shape_range_info_filename should be set in " \ "collect_shape mode" else: assert self.shape_range_info_filename and \ os.path.isfile(self.shape_range_info_filename), \ "shape_range_info_filename {} is not a " \ "file".format(self.shape_range_info_filename) self._collect_shape = value class InferenceEngine(object): """ Model Parallel Inference Engine Args: model_dir (string): root directory of inference model mp_degree (int): model parallel size tensorrt_config (TensorRTConfig): configurations for TensorRT inference """ def __init__(self, model_dir, mp_degree=1, tensorrt_config=None, device=None): self.model_dir = model_dir self.mp_degree = mp_degree self.tensorrt_config = tensorrt_config self.auto = False self.device = device for fname in os.listdir(model_dir): if "auto" in fname: self.auto = True break if mp_degree == 1: self.nranks = 1 self.rank = 0 else: self.nranks = fleet.worker_num() self.rank = fleet.worker_index() if not self.auto: self._check_model() self._static_guard = _StaticGuard() with self._static_guard: self._init_predictor() def _check_model(self): if not os.path.isdir(self.model_dir): raise ValueError('model_dir is not a directory') rank_path = os.path.join(self.model_dir, "rank_{}".format(self.rank)) if not os.path.isdir(rank_path): raise ValueError('rank_{} directory not found'.format(self.rank)) 
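# Exactly one *.pdmodel and one *.pdiparams file are expected under the
# rank_{rank} directory; _check_and_get_file below raises a ValueError when
# zero or multiple candidates are found.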
model_files = [] param_files = [] for fname in os.listdir(rank_path): if os.path.splitext(fname)[1] == '.pdmodel': model_files.append(fname) if os.path.splitext(fname)[1] == '.pdiparams': param_files.append(fname) def _check_and_get_file(files, tag): if len(files) == 0: raise ValueError("no {} file found under {}".format(tag, rank_path)) elif len(files) > 1: raise ValueError("multiple {} file found under {}".format( tag, rank_path)) else: return os.path.join(self.model_dir, 'rank_{}'.format(self.rank), files[0]) self.model_file = _check_and_get_file(model_files, 'pdmodel') self.param_file = _check_and_get_file(param_files, 'pdiparams') def _generate_comm_init_config(self, rank, nranks): ring_id_to_ranks = ','.join(['0'] + [str(i) for i in range(nranks)]) rank_to_ring_ids = ''.join(['{},0\n'.format(i) for i in range(nranks)]) comm_str = '[ring_id -> ranks]\n' + ring_id_to_ranks + \ '\n[rank -> ring_ids]\n' + rank_to_ring_ids config_fname = "./.comm_config{}.csv".format(rank) if os.path.exists(config_fname): os.remove(config_fname) with open(config_fname, 'w') as f: f.write(comm_str) return config_fname def _init_predictor(self): if self.auto: self.model_file = os.path.join( self.model_dir, 'auto_dist{}.pdmodel'.format(self.rank)) self.param_file = os.path.join( self.model_dir, 'auto_dist{}.pdiparams'.format(self.rank)) config = paddle.inference.Config(self.model_file, self.param_file) config.enable_memory_optim() config.switch_ir_optim(True) if self.device: device_id = int( os.environ.get(f'FLAGS_selected_{self.device}s', 0)) config.enable_custom_device(self.device, device_id) elif paddle.fluid.core.is_compiled_with_cuda(): device_id = int(os.environ.get('FLAGS_selected_gpus', 0)) config.enable_use_gpu(100, device_id) elif paddle.fluid.core.is_compiled_with_xpu(): device_id = int(os.environ.get('FLAGS_selected_xpus', 0)) config.enable_xpu() config.set_xpu_device_id(device_id) # distributed config if self.mp_degree > 1: trainer_endpoints = fleet.worker_endpoints() current_endpoint = trainer_endpoints[self.rank] dist_config = config.dist_config() dist_config.set_ranks(self.nranks, self.rank) dist_config.set_endpoints(trainer_endpoints, current_endpoint) dist_config.enable_dist_model(True) if self.auto: config_fname = os.path.join(self.model_dir, "rank_mapping.csv") else: config_fname = self._generate_comm_init_config(self.rank, self.nranks) dist_config.set_comm_init_config(config_fname) config.set_dist_config(dist_config) # TensorRT config if self.tensorrt_config: config.enable_tensorrt_engine( max_batch_size=self.tensorrt_config.max_batch_size, workspace_size=self.tensorrt_config.workspace_size, min_subgraph_size=self.tensorrt_config.min_subgraph_size, precision_mode=self.tensorrt_config.precision, use_static=self.tensorrt_config.use_static, use_calib_mode=self.tensorrt_config.use_calib_mode) if self.tensorrt_config.collect_shape: config.collect_shape_range_info( self.tensorrt_config.shape_range_info_filename) else: config.enable_tuned_tensorrt_dynamic_shape( self.tensorrt_config.shape_range_info_filename, True) self.predictor = paddle.inference.create_predictor(config) def input_names(self): return self.predictor.get_input_names() def output_names(self): return self.predictor.get_output_names() def predict(self, data): # data in dict/list format with self._static_guard: if isinstance(data, Sequence): if len(data) != len(self.input_names()): raise ValueError() for d, name in zip(data, self.input_names()): handle = self.predictor.get_input_handle(name) handle.copy_from_cpu(np.array(d.copy())) 
elif isinstance(data, Mapping): # key check for k, v in data.items(): handle = self.predictor.get_input_handle(k) handle.copy_from_cpu(np.array(v)) else: raise ValueError() self.predictor.run() return {name: self.predictor.get_output_handle(name).copy_to_cpu() \ for name in self.output_names()} ================================================ FILE: ppfleetx/core/module/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .basic_module import BasicModule ================================================ FILE: ppfleetx/core/module/basic_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The file has been adapted from lightning file: # https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/module.py # Git commit hash: 2d9e00fab64c8b19a8646f755a95bcb092aa710f # We retain the following license from the original files: # Copyright 2018-2021 William Falcon. All rights reserved. # # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. import paddle import paddle.nn as nn class BasicModule(nn.Layer): """ """ def __init__(self, configs, *args, **kwargs): self.configs = self.process_configs(configs) super().__init__(*args, **kwargs) self.model = self.get_model() def process_configs(self, configs): return configs def get_model(self): raise NotImplementedError def get_loss_fn(self): pass def pretreating_batch(self, batch): return batch def forward(self, *args, **kwargs): return super().forward(*args, **kwargs) def training_step(self, *args, **kwargs): raise NotImplementedError def training_step_end(self, *args, **kwargs): pass def validation_step(self, *args, **kwargs): pass def validation_step_end(self, *args, **kwargs): pass def test_step(self, *args, **kwargs): pass def test_step_end(self, *args, **kwargs): pass def backward(self, loss): loss.backward() def input_spec(self): raise NotImplementedError( "Please redefine Module.input_spec for model export") def inference_end(self, outputs): pass def training_epoch_end(self, *args, **kwargs): pass def validation_epoch_end(self, *args, **kwargs): pass ================================================ FILE: ppfleetx/data/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import copy import random import numpy as np import paddle from ppfleetx.data import dataset, sampler, utils from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger def build_auto_dataset(config, mode): """ build dataset for auto parallel """ assert mode in ['Train', 'Eval', 'Test' ], "Dataset mode should be Train, Eval, Test" if mode not in config: return None dataset = build_dataset(config, mode) collate_fn = None if 'collate_fn' in config[mode].keys(): collate_fn_cfg = config[mode].pop('collate_fn', None) if isinstance(collate_fn_cfg, str): collate_fn = getattr( utils, collate_fn_cfg) if collate_fn_cfg is not None else None elif isinstance(collate_fn_cfg, dict): collate_fn_class_name = collate_fn_cfg.pop("name") collate_fn = eval("utils.{}".format(collate_fn_class_name))( **collate_fn_cfg) logger.debug("build collate_fn({}) success...".format(collate_fn)) dataset.collate_fn = collate_fn dataset.sample_split = config[mode].pop('sample_split', None) return dataset def build_dataset(config, mode): # build dataset config_dataset = config[mode].dataset config_dataset = copy.deepcopy(config_dataset) dataset_name = config_dataset.pop('name') dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) return dataset def build_dataloader(config, mode): assert mode in ['Train', 'Eval', 'Test' ], "Dataset mode should be Train, Eval, Test" if mode not in config: return None dataset = build_dataset(config, mode) batch_sampler = None # build sampler if 'sampler' in config[mode].keys(): config_sampler = config[mode].sampler config_sampler = copy.deepcopy(config_sampler) sampler_name = config_sampler.pop("name") batch_sampler = eval("sampler.{}".format(sampler_name))( dataset, **config_sampler) logger.debug("build batch_sampler({}) success...".format( batch_sampler)) collate_fn = None config_loader = {} # build dataloader if 'loader' in config[mode].keys(): config_loader = config[mode].loader config_loader = copy.deepcopy(config_loader) collate_fn_cfg = config_loader.pop('collate_fn', None) if isinstance(collate_fn_cfg, str): collate_fn = getattr( utils, collate_fn_cfg) if collate_fn_cfg is not None else None elif isinstance(collate_fn_cfg, dict): collate_fn_class_name = collate_fn_cfg.pop("name") collate_fn = eval("utils.{}".format(collate_fn_class_name))( **collate_fn_cfg) logger.debug("build collate_fn({}) success...".format(collate_fn)) def worker_init_fn(worker_id): """ set seed in subproces for dataloader when num_workers > 0""" np.random.seed(env.get_dp_seed() + worker_id) random.seed(env.get_dp_seed() + worker_id) data_loader = paddle.io.DataLoader( dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, worker_init_fn=worker_init_fn, **config_loader) logger.debug("build data_loader({}) success...".format(data_loader)) return data_loader ================================================ FILE: ppfleetx/data/data_tools/__init__.py 
================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/cpp/Makefile ================================================ CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color CPPFLAGS += $(shell $(PYTHON_BIN) -m pybind11 --includes) CPPFLAGS += $(shell python3-config --includes) LIBNAME = fast_index_map_helpers LIBEXT = .so default: $(LIBNAME)$(LIBEXT) %$(LIBEXT): %.cpp $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ ================================================ FILE: ppfleetx/data/data_tools/cpp/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/cpp/compile.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import subprocess path = os.path.abspath(os.path.dirname(__file__)) def compile_helper(): """Compile helper function ar runtime. Make sure this is invoked on a single process.""" import sys excutable = sys.executable ret = subprocess.run(['make', '-C', path, f'PYTHON_BIN={excutable}']) if ret.returncode != 0: print("Making C++ dataset helpers module failed, exiting.") sys.exit(1) ================================================ FILE: ppfleetx/data/data_tools/cpp/fast_index_map_helpers.cpp ================================================ /* coding=utf-8 Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Helper methods for fast index mapping builds */ #include #include #include #include #include #include #include #include namespace py = pybind11; using namespace std; const int32_t LONG_SENTENCE_LEN = 512; void build_blending_indices( py::array_t &dataset_index, // NOLINT py::array_t &dataset_sample_index, // NOLINT const py::array_t &weights, const int32_t num_datasets, const int64_t size, const bool verbose) { /* Given multiple datasets and a weighting array, build samples such that it follows those wieghts.*/ if (verbose) { std::cout << "> building indices for blendable datasets ..." << std::endl; } // Get the pointer access without the checks. auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); auto weights_ptr = weights.unchecked<1>(); // Initialize buffer for number of samples used for each dataset. int64_t current_samples[num_datasets]; for (int64_t i = 0; i < num_datasets; ++i) { current_samples[i] = 0; } // For each sample: for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) { // Determine where the max error in sampling is happening. auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); int64_t max_error_index = 0; double max_error = weights_ptr[0] * sample_idx_double - static_cast(current_samples[0]); for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) { double error = weights_ptr[dataset_idx] * sample_idx_double - static_cast(current_samples[dataset_idx]); if (error > max_error) { max_error = error; max_error_index = dataset_idx; } } // Populate the indices. dataset_index_ptr[sample_idx] = static_cast(max_error_index); dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; // Update the total samples. current_samples[max_error_index] += 1; } // print info if (verbose) { std::cout << " > sample ratios:" << std::endl; for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) { auto ratio = static_cast(current_samples[dataset_idx]) / static_cast(size); std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; } } } py::array build_sample_idx(const py::array_t &sizes_, const py::array_t &doc_idx_, const int32_t seq_length, const int32_t num_epochs, const int64_t tokens_per_epoch) { /* Sample index (sample_idx) is used for gpt2 like dataset for which the documents are flattened and the samples are built based on this 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 1] is the starting offset in that document.*/ // Consistency checks. assert(seq_length > 1); assert(num_epochs > 0); assert(tokens_per_epoch > 1); // Remove bound checks. auto sizes = sizes_.unchecked<1>(); auto doc_idx = doc_idx_.unchecked<1>(); // Mapping and it's length (1D). 
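  // Each sample spans seq_length + 1 tokens (inputs plus the one-token shift used
  // for the labels) and consecutive samples share one boundary token, so a pool of
  // num_epochs * tokens_per_epoch tokens yields (tokens - 1) / seq_length samples,
  // which is what the expression below computes.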
int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; int64_t *sample_idx = new int64_t[2 * (num_samples + 1)]; cout << " using:" << endl << std::flush; cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl << std::flush; cout << " number of epochs: " << num_epochs << endl << std::flush; cout << " sequence length: " << seq_length << endl << std::flush; cout << " total number of samples: " << num_samples << endl << std::flush; // Index into sample_idx. int64_t sample_index = 0; // Index into doc_idx. int64_t doc_idx_index = 0; // Begining offset for each document. int64_t doc_offset = 0; // Start with first document and no offset. sample_idx[2 * sample_index] = doc_idx_index; sample_idx[2 * sample_index + 1] = doc_offset; ++sample_index; while (sample_index <= num_samples) { // Start with a fresh sequence. int64_t remaining_seq_length = seq_length + 1; while (remaining_seq_length != 0) { // Get the document length. auto doc_id = doc_idx[doc_idx_index]; auto doc_length = sizes[doc_id] - doc_offset; // And add it to the current sequence. remaining_seq_length -= doc_length; // If we have more than a full sequence, adjust offset and set // remaining length to zero so we return from the while loop. // Note that -1 here is for the same reason we have -1 in // `_num_epochs` calculations. if (remaining_seq_length <= 0) { doc_offset += (remaining_seq_length + doc_length - 1); remaining_seq_length = 0; } else { // Otherwise, start from the begining of the next document. ++doc_idx_index; doc_offset = 0; } } // Record the sequence. sample_idx[2 * sample_index] = doc_idx_index; sample_idx[2 * sample_index + 1] = doc_offset; ++sample_index; } // Method to deallocate memory. py::capsule free_when_done(sample_idx, [](void *mem_) { int64_t *mem = reinterpret_cast(mem_); delete[] mem; }); // Return the numpy array. const auto byte_size = sizeof(int64_t); return py::array(std::vector{num_samples + 1, 2}, // shape {2 * byte_size, byte_size}, // C-style contiguous strides sample_idx, // the data pointer free_when_done); // numpy array references } inline int32_t get_target_sample_len(const int32_t short_seq_ratio, const int32_t max_length, std::mt19937 &rand32_gen) { /* Training sample length. */ if (short_seq_ratio == 0) { return max_length; } const auto random_number = rand32_gen(); if ((random_number % short_seq_ratio) == 0) { return 2 + random_number % (max_length - 1); } return max_length; } template py::array build_mapping_impl(const py::array_t &docs_, const py::array_t &sizes_, const int32_t num_epochs, const uint64_t max_num_samples, const int32_t max_seq_length, const double short_seq_prob, const int32_t seed, const bool verbose, const int32_t min_num_sent) { /* Build a mapping of (start-index, end-index, sequence-length) where start and end index are the indices of the sentences in the sample and sequence-length is the target sequence length. */ // Consistency checks. assert(num_epochs > 0); assert(max_seq_length > 1); assert(short_seq_prob >= 0.0); assert(short_seq_prob <= 1.0); assert(seed > 0); // Remove bound checks. auto docs = docs_.unchecked<1>(); auto sizes = sizes_.unchecked<1>(); // For efficiency, convert probability to ratio. Note: rand() generates int. 
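  // For example, short_seq_prob = 0.1 gives short_seq_ratio = 10, so roughly one in
  // ten calls to get_target_sample_len() returns a shortened target length, while
  // short_seq_prob = 0 keeps every sample at the maximum sequence length.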
int32_t short_seq_ratio = 0; if (short_seq_prob > 0) { short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); } if (verbose) { const auto sent_start_index = docs[0]; const auto sent_end_index = docs[docs_.shape(0) - 1]; const auto num_sentences = sent_end_index - sent_start_index; cout << " using:" << endl << std::flush; cout << " number of documents: " << docs_.shape(0) - 1 << endl << std::flush; cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl << std::flush; cout << " total number of sentences: " << num_sentences << endl << std::flush; cout << " number of epochs: " << num_epochs << endl << std::flush; cout << " maximum number of samples: " << max_num_samples << endl << std::flush; cout << " maximum sequence length: " << max_seq_length << endl << std::flush; cout << " minimum sentences num: " << min_num_sent << endl << std::flush; cout << " short sequence probability: " << short_seq_prob << endl << std::flush; cout << " short sequence ration (1/prob): " << short_seq_ratio << endl << std::flush; cout << " seed: " << seed << endl << std::flush; } // Mapping and it's length (1D). int64_t num_samples = -1; DocIdx *maps = NULL; // Perform two iterations, in the first iteration get the size // and allocate memory and in the second iteration populate the map. bool second = false; for (int32_t iteration = 0; iteration < 2; ++iteration) { // Set the seed so both iterations produce the same results. std::mt19937 rand32_gen(seed); // Set the flag on second iteration. second = (iteration == 1); // Counters: uint64_t empty_docs = 0; uint64_t one_sent_docs = 0; uint64_t long_sent_docs = 0; // Current map index. uint64_t map_index = 0; // For each epoch: for (int32_t epoch = 0; epoch < num_epochs; ++epoch) { if (map_index >= max_num_samples) { if (verbose && (!second)) { cout << " reached " << max_num_samples << " samples after " << epoch << " epochs ..." << endl << std::flush; } break; } if (epoch > 0 && map_index == 0) { cout << endl << " No available documtment find this dataset." << endl << std::flush; throw std::invalid_argument( "Invalid dataset! the document should be with more than " + std::to_string(min_num_sent) + " scentences."); } // For each document: for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) { // Document sentences are in [sent_index_first, sent_index_last) const auto sent_index_first = docs[doc]; const auto sent_index_last = docs[doc + 1]; // At the begining of the document previous index is the // start index. auto prev_start_index = sent_index_first; // Remaining documents. auto num_remain_sent = sent_index_last - sent_index_first; // Some bookkeeping if ((epoch == 0) && (!second)) { if (num_remain_sent == 0) { ++empty_docs; } if (num_remain_sent == 1) { ++one_sent_docs; } } // Detect documents with long sentences. bool contains_long_sentence = false; if (num_remain_sent > 1) { for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { if (sizes[sent_index] > LONG_SENTENCE_LEN) { if ((epoch == 0) && (!second)) { ++long_sent_docs; } contains_long_sentence = true; break; } } } // If we have more than two sentences. if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { // Set values. auto seq_len = int32_t{0}; auto num_sent = int32_t{0}; auto target_seq_len = get_target_sample_len( short_seq_ratio, max_seq_length, rand32_gen); // Loop through sentences. for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { // Add the size and number of sentences. 
seq_len += sizes[sent_index]; ++num_sent; --num_remain_sent; // If we have reached the target length. // and if not only one sentence is left in the document. // and if we have at least two sentneces. // and if we have reached end of the document. if (((seq_len >= target_seq_len) && (num_remain_sent > 1) && (num_sent >= min_num_sent)) || (num_remain_sent == 0)) { // Check for overflow. if ((3 * map_index + 2) > std::numeric_limits::max()) { cout << "number of samples exceeded maximum " << "allowed by type int64: " << std::numeric_limits::max() << endl; throw std::overflow_error("Number of samples"); } // Populate the map. if (second) { const auto map_index_0 = 3 * map_index; maps[map_index_0] = static_cast(prev_start_index); maps[map_index_0 + 1] = static_cast(sent_index + 1); maps[map_index_0 + 2] = static_cast(target_seq_len); } // Update indices / counters. ++map_index; prev_start_index = sent_index + 1; target_seq_len = get_target_sample_len( short_seq_ratio, max_seq_length, rand32_gen); seq_len = 0; num_sent = 0; } } // for (auto sent_index=sent_index_first; ... } // if (num_remain_sent > 1) { } // for (int doc=0; doc < num_docs; ++doc) { } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { cout << " number of empty documents: " << empty_docs << endl << std::flush; cout << " number of documents with one sentence: " << one_sent_docs << endl << std::flush; cout << " number of documents with long sentences: " << long_sent_docs << endl << std::flush; cout << " will create mapping for " << map_index << " samples" << endl << std::flush; } assert(maps == NULL); assert(num_samples < 0); maps = new DocIdx[3 * map_index]; num_samples = static_cast(map_index); } } // for (int iteration=0; iteration < 2; ++iteration) { // Shuffle. // We need a 64 bit random number generator as we might have more // than 2 billion samples. std::mt19937_64 rand64_gen(seed + 1); for (auto i = (num_samples - 1); i > 0; --i) { const auto j = static_cast(rand64_gen() % (i + 1)); const auto i0 = 3 * i; const auto j0 = 3 * j; // Swap values. swap(maps[i0], maps[j0]); swap(maps[i0 + 1], maps[j0 + 1]); swap(maps[i0 + 2], maps[j0 + 2]); } // Method to deallocate memory. py::capsule free_when_done(maps, [](void *mem_) { DocIdx *mem = reinterpret_cast(mem_); delete[] mem; }); // Return the numpy array. const auto byte_size = sizeof(DocIdx); return py::array(std::vector{num_samples, 3}, // shape {3 * byte_size, byte_size}, // C-style contiguous strides maps, // the data pointer free_when_done); // numpy array references } py::array build_mapping(const py::array_t &docs_, const py::array_t &sizes_, const int num_epochs, const uint64_t max_num_samples, const int max_seq_length, const double short_seq_prob, const int seed, const bool verbose, const int32_t min_num_sent) { if (sizes_.size() > std::numeric_limits::max()) { if (verbose) { cout << " using uint64 for data mapping..." << endl << std::flush; } return build_mapping_impl( docs_, sizes_, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, verbose, min_num_sent); } else { if (verbose) { cout << " using uint32 for data mapping..." 
<< endl << std::flush; } return build_mapping_impl( docs_, sizes_, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, verbose, min_num_sent); } } template py::array build_blocks_mapping_impl( const py::array_t &docs_, const py::array_t &sizes_, const py::array_t &titles_sizes_, const int32_t num_epochs, const uint64_t max_num_samples, const int32_t max_seq_length, const int32_t seed, const bool verbose, const bool use_one_sent_blocks) { /* Build a mapping of (start-index, end-index, sequence-length) where start and end index are the indices of the sentences in the sample and sequence-length is the target sequence length. */ // Consistency checks. assert(num_epochs > 0); assert(max_seq_length > 1); assert(seed > 0); // Remove bound checks. auto docs = docs_.unchecked<1>(); auto sizes = sizes_.unchecked<1>(); auto titles_sizes = titles_sizes_.unchecked<1>(); if (verbose) { const auto sent_start_index = docs[0]; const auto sent_end_index = docs[docs_.shape(0) - 1]; const auto num_sentences = sent_end_index - sent_start_index; cout << " using:" << endl << std::flush; cout << " number of documents: " << docs_.shape(0) - 1 << endl << std::flush; cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl << std::flush; cout << " total number of sentences: " << num_sentences << endl << std::flush; cout << " number of epochs: " << num_epochs << endl << std::flush; cout << " maximum number of samples: " << max_num_samples << endl << std::flush; cout << " maximum sequence length: " << max_seq_length << endl << std::flush; cout << " seed: " << seed << endl << std::flush; } // Mapping and its length (1D). int64_t num_samples = -1; DocIdx *maps = NULL; // Acceptable number of sentences per block. int min_num_sent = 2; if (use_one_sent_blocks) { min_num_sent = 1; } // Perform two iterations, in the first iteration get the size // and allocate memory and in the second iteration populate the map. bool second = false; for (int32_t iteration = 0; iteration < 2; ++iteration) { // Set the flag on second iteration. second = (iteration == 1); // Current map index. uint64_t map_index = 0; uint64_t empty_docs = 0; uint64_t one_sent_docs = 0; uint64_t long_sent_docs = 0; // For each epoch: for (int32_t epoch = 0; epoch < num_epochs; ++epoch) { // assign every block a unique id int32_t block_id = 0; if (map_index >= max_num_samples) { if (verbose && (!second)) { cout << " reached " << max_num_samples << " samples after " << epoch << " epochs ..." << endl << std::flush; } break; } // For each document: for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) { // Document sentences are in [sent_index_first, sent_index_last) const auto sent_index_first = docs[doc]; const auto sent_index_last = docs[doc + 1]; const auto target_seq_len = max_seq_length - titles_sizes[doc]; // At the begining of the document previous index is the // start index. auto prev_start_index = sent_index_first; // Remaining documents. auto num_remain_sent = sent_index_last - sent_index_first; // Some bookkeeping if ((epoch == 0) && (!second)) { if (num_remain_sent == 0) { ++empty_docs; } if (num_remain_sent == 1) { ++one_sent_docs; } } // Detect documents with long sentences. 
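        // A document is flagged if any of its sentences exceeds LONG_SENTENCE_LEN
        // (512) tokens; flagged documents are skipped below and contribute no
        // blocks to the mapping.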
bool contains_long_sentence = false; if (num_remain_sent >= min_num_sent) { for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { if (sizes[sent_index] > LONG_SENTENCE_LEN) { if ((epoch == 0) && (!second)) { ++long_sent_docs; } contains_long_sentence = true; break; } } } // If we have enough sentences and no long sentences. if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) { // Set values. auto seq_len = int32_t{0}; auto num_sent = int32_t{0}; // Loop through sentences. for (auto sent_index = sent_index_first; sent_index < sent_index_last; ++sent_index) { // Add the size and number of sentences. seq_len += sizes[sent_index]; ++num_sent; --num_remain_sent; // If we have reached the target length. // and there are an acceptable number of sentences left // and if we have at least the minimum number of sentences. // or if we have reached end of the document. if (((seq_len >= target_seq_len) && (num_remain_sent >= min_num_sent) && (num_sent >= min_num_sent)) || (num_remain_sent == 0)) { // Populate the map. if (second) { const auto map_index_0 = 4 * map_index; // Each sample has 4 items: the starting sentence index, ending // sentence index, // the index of the document from which the block comes (used // for fetching titles) // and the unique id of the block (used for creating block // indexes) maps[map_index_0] = static_cast(prev_start_index); maps[map_index_0 + 1] = static_cast(sent_index + 1); maps[map_index_0 + 2] = static_cast(doc); maps[map_index_0 + 3] = static_cast(block_id); } // Update indices / counters. ++map_index; ++block_id; prev_start_index = sent_index + 1; seq_len = 0; num_sent = 0; } } // for (auto sent_index=sent_index_first; ... } // if (num_remain_sent > 1) { } // for (int doc=0; doc < num_docs; ++doc) { } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { cout << " number of empty documents: " << empty_docs << endl << std::flush; cout << " number of documents with one sentence: " << one_sent_docs << endl << std::flush; cout << " number of documents with long sentences: " << long_sent_docs << endl << std::flush; cout << " will create mapping for " << map_index << " samples" << endl << std::flush; } assert(maps == NULL); assert(num_samples < 0); maps = new DocIdx[4 * map_index]; num_samples = static_cast(map_index); } } // for (int iteration=0; iteration < 2; ++iteration) { // Shuffle. // We need a 64 bit random number generator as we might have more // than 2 billion samples. std::mt19937_64 rand64_gen(seed + 1); for (auto i = (num_samples - 1); i > 0; --i) { const auto j = static_cast(rand64_gen() % (i + 1)); const auto i0 = 4 * i; const auto j0 = 4 * j; // Swap values. swap(maps[i0], maps[j0]); swap(maps[i0 + 1], maps[j0 + 1]); swap(maps[i0 + 2], maps[j0 + 2]); swap(maps[i0 + 3], maps[j0 + 3]); } // Method to deallocate memory. py::capsule free_when_done(maps, [](void *mem_) { DocIdx *mem = reinterpret_cast(mem_); delete[] mem; }); // Return the numpy array. 
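  // The returned array has shape [num_samples, 4]; each row stores
  // (start sentence index, end sentence index, document index, block id).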
const auto byte_size = sizeof(DocIdx); return py::array(std::vector{num_samples, 4}, // shape {4 * byte_size, byte_size}, // C-style contiguous strides maps, // the data pointer free_when_done); // numpy array references } py::array build_blocks_mapping( const py::array_t &docs_, const py::array_t &sizes_, const py::array_t &titles_sizes_, const int num_epochs, const uint64_t max_num_samples, const int max_seq_length, const int seed, const bool verbose, const bool use_one_sent_blocks) { if (sizes_.size() > std::numeric_limits::max()) { if (verbose) { cout << " using uint64 for data mapping..." << endl << std::flush; } return build_blocks_mapping_impl( docs_, sizes_, titles_sizes_, num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); } else { if (verbose) { cout << " using uint32 for data mapping..." << endl << std::flush; } return build_blocks_mapping_impl( docs_, sizes_, titles_sizes_, num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); } } PYBIND11_MODULE(fast_index_map_helpers, m) { m.def("build_mapping", &build_mapping); m.def("build_blocks_mapping", &build_blocks_mapping); m.def("build_sample_idx", &build_sample_idx); m.def("build_blending_indices", &build_blending_indices); } ================================================ FILE: ppfleetx/data/data_tools/ernie/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/README.md ================================================ # PaddleFleetX 预训练数据准备流程 本示例致力于打造基于PaddleFleetX预训练模型的最佳实践。 我们将预训练数据过程划分为以下部分 - 原始数据转换,原始文本转换为jsonl的json字符串格式。 - 数据ID化,断句、分词、tokenize转化为token id格式。 - 训练index文件生成,生成train、valid、test的每个样本索引。 - token动态mask(可选),python 层实时mask文本。 本目录下主要包含一下文件: ``` ├── create_pretraining_data.py ├── dataset_utils.py ├── ernie_dataset.py ├── helpers.cpp ├── Makefile ├── README.md └── trans_to_json.py ``` 其中,`trans_to_json.py`是原始数据转化的脚本,将数据转化为json串格式。 `create_pretraining_data.py`将jsonl文本,断句、分词后,tokenizer转化为token id。 `dataset_utils.py`中包含了index生成、动态mask的实现。 `ernie_dataset.py`通过调用`dataset_utils.py`的一些函数,产生ernie的输入dataset。 ### 环境依赖 - tqdm - numpy - pybind11 - tool_helpers - lac (可选) - zstandard (可选) 安装命令`pip install tqdm numpy pybind11 tool_helpers lac zstandard`。另,部分功能需要`g++>=4.8`编译支持 ## 训练全流程数据Pipeline 飞桨是自主研发、功能完备、开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体 |步骤|阶段                     |数据格式| 样例| |-|-|-|-| | 0️⃣初始状态 | -|原始数据:
**每个doc之间用空行间隔开**<br/>- 中文,默认每句换行符,作为句子结束。<br/>- 英文,默认使用nltk判断句子结束 | ```飞桨是功能完备、开源开放的产业级深度学习平台。```<br/>```飞桨拥有核心训练和推理框架、基础模型库。```<br/><br/>```PaddleNLP是自然语言处理领域的优秀工具。``` |
|1️⃣原始数据转换<br/>`trans_to_json.py`|预处理<br/>输入:0️⃣初始状态<br/>输出:jsonl|jsonl格式:每个doc对应一行json字符串| ```{"text": "飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有..."}```<br/>```{"text": "PaddleNLP是自然语言..."}``` |
|❇️(**可选**)数据中文分词<br/>`words_segmentation.py`|语料分词:中文WWM<br/>输入:jsonl<br/>输出:0️⃣初始状态| 将jsonl格式的数据,恢复成分词后的原始格式数据 | ```飞桨 是 功能 完备、开源 开放的 产业级 深度学习 平台。```<br/>```飞桨 拥有 核心 训练和推理 框架、基础 模型库。```<br/><br/>```PaddleNLP 是 自然语言处理领域 的 优秀工具。``` |
|2️⃣数据ID化<br/>`create_pretrain_data.py`|预处理| npy格式:数据id化后的token id<br/>npz格式:数据句子、文章位置索引 | - |
|3️⃣训练index文件生成|训练启动|npy格式:<br/>根据训练步数max_steps生成<br/>
    train、valid、test的每个样本索引文件| - |4️⃣token动态mask(可选)| Dataset取数据 | 无 |- 注意: - **❇️(**可选**)数据中文分词** 是中文预训练做 WWM 的可选步骤 - 当你的数据比较少时,分词耗时较少,不需要词步骤。直接在`create_pretrain_data.py`步骤中分词即可。 - 目的是为了提前分词,加快后续数据ID转化步骤。 - 如果这里输入的是 jsonl格式文件,最好为多文件,`trans_to_json.py` 时候开启`no-merge`选项。 - 当你的数据集比较大,或者需要尝试多次转换数据的时候,提前分词可以避免`create_pretrain_data.py`时每次都运行一次分词程序。 - 转换后,需要重新 进行步骤 1️⃣`原始数据转换 trans_to_json.py`,最后2️⃣`数据ID化`步骤设置`--cn_splited=True`参数。 - 2️⃣`数据ID化`也可以在转化ID的同时,一起实现分词。不需要❇️`数据中文分词`步骤。 ## 数据教程汇总 针对目前开源的数据集,PaddleFleetX提供了详细的数据教程,点击对应数据集的链接,即可开始进行数据制作: | 名称 | 文本类型 | 纯文本大小 | 适配模型 |-|-|-|-| | [CLUECorpusSmall](./docs/CLUECorpusSmall.md)| 中文 | 14GB | ERNIE | [OpenWebText2](./docs/OpenWebText2.md) | 英文 | 70GB | GPT | [WuDaoCorpus2.0 Base](./docs/WuDaoCorpusBase.md)| 中文 | 200GB | ERNIE | [CLUECorpus2020](./docs/CLUECorpus2020.md)| 中文 | 200GB | ERNIE ## ERNIE预训练详细准备 下面以ERNIE预训练为例,简要介绍一下预训练的全流程。 ### 原始数据 首先下载样例数据: ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 mkdir preprocess && cd preprocess wget https://bj.bcebos.com/paddlenlp/models/transformers/data_tools/baike.txt cd .. ``` ### 原始数据转换 jsonl 格式 使用`trans_to_json.py`转化为json串格式,下面是脚本的使用说明 ``` optional arguments: -h, --help show this help message and exit --input_path INPUT_PATH Path to you raw files. Folder or file path. 必须设置,可以是文件夹或者单个文件。文件夹中的目录默认最多搜索两层子目录。 --output_path OUTPUT_PATH Path to save the output json files. 必须设置,输出文件的名字。 --json_key JSON_KEY The content key of json file. 建议不修改,默认的key是text --doc_spliter DOC_SPLITER Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank. 根据实际情况修改,默认空行作为文章换行符。 --min_doc_length MIN_DOC_LENGTH Minimal char of a documment. 可选。过滤掉长度多短的文章,默认值10 --workers WORKERS Number of worker processes to launch 可选。多进程转化文件,适用于 input_path 中包含的文件数据较多的情况。每个文件,分配给不同worker处理 --log_interval LOG_INTERVAL Interval between progress updates. 可选。此处的interval是值处理完文件个数的间隔。 --no-merge Don't merge the file. 可选。默认不开启这个选项,默认每个文件转换的jsonl文本,会拼接成到同一个文件。 --no-shuffle Don't shuffle the file. 可选。默认不开启这个选项,默认对处理完进行shuffle。 ``` 根据说明,我们使用下面简单命令,可以得到`baike_sample.jsonl`文件。此处,我们对文章所有doc进行了shuffle。 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py --input_path ./preprocess --output_path preprocess/baike_sample #查看数据 head -1 baike_sample.jsonl {"text": "中国效仿西方发展工业的过程,于中华民国国民政府成立后至中日战争开战前夕已顺畅发展,尽管其间受到内外因素的多重干扰。尔后直至中日战争和国共战争的结束, 中国始有较为长期的和平发展时期。\n1980年代以来,邓小平政府宣布改革开放,开始实行社会主义市场经济并推行经济体制改革。中国大陆近年至2010年,GDP超过72000亿美元, 已经成为美国之后的世界第二经济大国,普遍认为中国是世界上发展速度最快的经济体,但是人均国民生产总值仍位于世界中等水平(第89位),并逐渐受到资源限制和贫富差距加 大的制约。中华人民共和国省份中,广东为GDP最高的第一强省,浙江为人均收入最高的第一富省。中国大陆、香港、澳门、台湾之间的经济联系在全球化的过程中日益紧密。\n"} ``` ### 数据ID化 本部分,我们使用 `create_pretraining_data.py` 脚本将前面得到的 `baike_sample.jsonl` 进行tokenize id化处理。 ``` optional arguments: -h, --help show this help message and exit --model_name MODEL_NAME What model to use. 必须设置,如:ernie-1.0-base-zh, 可以参考已有的模型名称 https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer} What type of tokenizer to use. 模型对应的tokenizer, 目前暂时只支持 ERNIE,BERT,GPT data input/output: --input_path INPUT_PATH Path to input JSON files. 必须设置,输入文件jsonl的目录 --output_prefix OUTPUT_PREFIX Output prefix to store output file. 必须设置,输出文件的名称。 假设名称为XXX,则会输出 XXX_ids.npy, XXX_idx.npz 两个文件。 npy文件,数据id化后的token ids; npz文件,数据句子、文章位置索引。 --data_format {JSON} Only support json format for now. One document per line. 不需要设置。目前默认处理jsonl数据格式 --json_key JSON_KEY For JSON format. 
Space separate listed of keys to extract from json 文本串json的key值。同前面trans_to_json.py的json_key,默认text为key --split_sentences Split documents into sentences. 是否需要将文章划分成句子。一般而言,GPT不需要,BERT/ERNIE模型需要 chinese words: --chinese Is corpus need words segmentation step for chinese words. 中文情形必须设置。处理的文本类型是否是中文。 --cn_whole_word_segment Is corpus need words segmentation step for chinese words WWM. 可选。是否需要WWM策略。一般而言,BERT/ERNIE模型需要,GPT不需要。 --cn_seg_func {lac,seg,jieba} Words segment function for chinese words. 默认jieba,jieba速度较快,lac模型更准确,计算量高。 --cn_splited Is chinese corpus is splited in to words. 分词后的文本,可选。设置此选项则,cn_seg_func不起作用。 例如分词后文本串 "中国 效仿 西方 发展 工业 的过 程" --cn_split_dimer CN_SPLIT_DIMER Split dimer between chinese words. 配合cn_splited使用,默认空格表示分词间隔。 common config: --append_eos Append an token to the end of a document. gpt模型专用,gpt设置此选项,表示doc结束。 --log_interval LOG_INTERVAL Interval between progress updates 打印日志间隔,interval表示处理 文本行数/doc数的 间隔。 --workers WORKERS Number of worker processes to launch 处理文本id化的进程个数。 ``` 通过下面脚本转化,我们可以得到处理好的预训练数据,token ids:`baike_sample_ids.npy`, 文章索引信息`baike_sample_idx.npz`. ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ --model_name ernie-1.0-base-zh \ --tokenizer_name ErnieTokenizer \ --input_path preprocess/baike_sample.jsonl \ --split_sentences\ --chinese \ --cn_whole_word_segment \ --output_prefix preprocess/baike_sample \ --workers 1 \ --log_interval 5 ``` 1. 如果您使用已经分好词的语料,可以设置 --cn_splited 为 True,同时指定--cn_split_dimer如空格。 2. 使用自定义词表的话,请指定model_name为词表所在的文件夹地址。 ### ERNIE 预训练开始 得到了处理好的训练数据,拷贝到data目录,即可开始ERNIE模型预训练。 ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 mkdir data mv ./preprocess/baike_sample* ./data sh ./projects/ernie/pretrain_ernie_base.sh # 建议修改 pretrain_ernie_base.sh 中的配置,将max_steps设置小一些。 ``` 代码说明: - ernie预训练使用的 dataset 代码文件在 `ernie_dataset.py` - 数据集index生成,动态mask相关代码实现在`dataset_utils.py` 用户可以根据自己的需求,灵活修改mask方式。具体可以参考`dataset_utils.py`中`create_masked_lm_predictions`函数。 可以自定义的选项有do_whole_word_mask, favor_longer_ngram, do_permutation, geometric_dist等, 可以参考[Megatron](https://github.com/NVIDIA/Megatron-LM)使用这些lm_mask策略。 ### FAQ #### C++代码编译失败怎么办? - 请先检查pybind11包是否安装,g++、make工具是否正常。 - 编译失败可能是本文件夹下的Makefile命令出现了一些问题。可以将Makefile中的python3、python3-config设置成完全的路径,如/usr/bin/python3.7。 ## 参考内容 注: 大部分数据流程,参考自[Megatron](https://github.com/NVIDIA/Megatron-LM),特此表达感谢。 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import io import re import argparse import json import multiprocessing import sys import time import numpy as np from tqdm import tqdm import paddlenlp.transformers as tfs try: import nltk nltk_available = True except ImportError: nltk_available = False def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--model_name', type=str, required=True, help='What model to use.') parser.add_argument( '--tokenizer_name', type=str, required=True, choices=[ 'ErnieTokenizer', 'BertTokenizer', 'GPTTokenizer', 'GPTChineseTokenizer', 'ElectraTokenizer' ], help='What type of tokenizer to use.') group = parser.add_argument_group(title='data input/output') group.add_argument( '--input_path', type=str, required=True, help='Path to input JSON files.') group.add_argument( '--output_prefix', type=str, required=True, help='Output prefix to store output file.') group.add_argument( '--data_format', type=str, default='text', choices=['JSON'], help='Only support json format for now. One document per line.') group.add_argument( '--json_key', type=str, default='text', help='For JSON format. Space separate listed of keys to extract from json' ) group.add_argument( '--split_sentences', action='store_true', help='Split documents into sentences.') group = parser.add_argument_group(title='chinese words') group.add_argument( '--chinese', action='store_true', help="Is corpus need words segmentation step for chinese words.") group.add_argument( '--cn_whole_word_segment', action='store_true', help="Is corpus need words segmentation step for chinese words WWM.") group.add_argument( '--cn_seg_func', type=str, default='jieba', choices=['lac', 'seg', 'jieba'], help='Words segment function for chinese words.') group.add_argument( '--cn_splited', action='store_true', help="Is chinese corpus is splited in to words.") group.add_argument( '--cn_split_dimer', type=str, default=' ', help="Split dimer between chinese words.") group = parser.add_argument_group(title='common config') group.add_argument( '--append_eos', action='store_true', help='Append an token to the end of a document.') group.add_argument( '--log_interval', type=int, default=100, help='Interval between progress updates') group.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') args = parser.parse_args() return args def lexical_analysis_fn(): from LAC import LAC lac = LAC(mode="lac") def process(line): words, _ = lac.run(line) return words return process def chinese_segmentation_fn(): from LAC import LAC lac_cws = LAC(mode='seg') def process(line): words = lac_cws.run(line) return words return process def jieba_segmentation_fn(): import jieba def process(line): words = jieba.cut(line) return list(words) return process CHINESE_SEG_FUNC = { 'lac': lexical_analysis_fn(), 'seg': chinese_segmentation_fn(), 'jieba': jieba_segmentation_fn(), } def get_whole_word_mask_tokens(tokens, words, max_word_length=6): """ Do whole word mask on Chinese word. First, we do Chinese word segmentation on the sequence of tokens, which are from the WordPiece tokenization. 
Then, we add the '##' mark on chinese characters which are in the middle of Chinese words. And if the tokens are not chinese characters, we just exploit the results of WordPiece tokenization as words. Such as, - text line : 通过利用mercer核,将样本从输入空间映射到高维特征空间,使原来没有显现的特征突现出来,取得了很好的图像分割效果。 - the input tokens (after WordPiece): ['通', '过', '利', '用', 'me', '##rc', '##er', '核', ',', '将', '样', '本', '从', '输', '入', '空', '间', '映', '射', '到', '高', '维', '特', '征', '空', '间', ',', '使', '原', '来', '没', '有', '显', '现', '的', '特', '征', '突', '现', '出', '来', ',', '取', '得', '了', '很', '好', '的', '图', '像', '分', '割', '效', '果', '。'] - the Chinese words (after Chinese word segmentation like jieba) ['通过', '利用', 'mercer', '核', ',', '将', '样本', '从', '输入', '空间', '映射', '到', '高维', '特征', '空间', ',', '使', '原来', '没有', '显现', '的', '特征', '突现', '出来', ',', '取得', '了', '很', '好', '的', '图像', '分割', '效果', '。'] - the output whole word mask tokens: ['通', '##过', '利', '##用', 'me', '##rc', '##er', '核', ',', '将', '样', '##本', '从', '输', '##入', '空', '##间', '映', '##射', '到', '高', '##维', '特', '##征', '空', '##间', ',', '使', '原', '##来', '没', '##有', '显', '##现', '的', '特', '##征', '突', '##现', '出', '##来', ',', '取', '##得', '了', '很', '好', '的', '图', '##像', '分', '##割', '效', '##果', '。'] Args: tokens(list(str)): The sequence of tokens, which are from the WordPiece tokenization. words(list(str)): The sequence of Chinese words. max_word_length(int, optional): The maximum chinese character in Chinese words. It avoids too long Chinese word to be masked. Defaults as 4. Returns: new_tokens(list(str)): The new token will be done with whole word masking strategy. """ new_tokens = [] # opt for long document words_set = set(words) i = 0 while i < len(tokens): # non-chinese character, then do word piece if len(re.findall('[\u4E00-\u9FA5]', tokens[i])) == 0: new_tokens.append(tokens[i]) i += 1 continue # add "##" mark on the middel tokens of Chinese words # such as ["通过", "利用"] -> ["通", "##过", "利", "##用"] has_add = False for length in range(max_word_length, 0, -1): if i + length > len(tokens): continue if ''.join(tokens[i:i + length]) in words_set: new_tokens.append(tokens[i]) for l in range(1, length): new_tokens.append('##' + tokens[i + l]) i += length has_add = True break if not has_add: new_tokens.append(tokens[i]) i += 1 return new_tokens class IdentitySplitter(object): def tokenize(self, *text): return text class NewlineSplitter(): def tokenize(self, text): return text.split("\n") class Converter(object): def __init__(self, args): self.args = args def initializer(self): Converter.tokenizer = getattr( tfs, self.args.tokenizer_name).from_pretrained(self.args.model_name) if self.args.cn_whole_word_segment: # Extend chinese char vocab for ErnieTokinzer Converter.tokenizer.extend_chinese_char() # Split document to sentence. 
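        # Splitter selection below: Chinese text is split on newlines (one sentence
        # per line after preprocessing), English text is split with the nltk punkt
        # sentence tokenizer, and IdentitySplitter keeps the whole document as one
        # piece when --split_sentences is not set.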
if self.args.split_sentences: if self.args.chinese: Converter.splitter = NewlineSplitter() else: if not nltk_available: print("NLTK is not available to split sentences.") exit() splitter = nltk.load("tokenizers/punkt/english.pickle") Converter.splitter = splitter else: Converter.splitter = IdentitySplitter() # Split sentence whole words mask for chinese if self.args.cn_whole_word_segment: if self.args.cn_splited: Converter.segment_func = lambda text: text.split(self.args.cn_split_dimer) else: Converter.segment_func = CHINESE_SEG_FUNC[ self.args.cn_seg_func] Converter.whole_word_mask = get_whole_word_mask_tokens else: Converter.segment_func = lambda x: x Converter.whole_word_mask = lambda x, y: x def process(text): words = Converter.segment_func(text) # if there are two empty word, the should a split dimer in the pos if self.args.cn_splited: pre_dimer = False for index, w in enumerate(words): if pre_dimer and len(w) == 0: words[index] = self.args.cn_split_dimer pre_dimer = False elif len(w) == 0: pre_dimer = True else: pre_dimer = False tokens = Converter.tokenizer.tokenize("".join(words)) tokens = Converter.whole_word_mask(tokens, words) tokens = Converter.tokenizer.convert_tokens_to_ids(tokens) return tokens Converter.process = process def encode(self, json_line): text = json.loads(json_line)[self.args.json_key] doc_ids = [] for sentence in Converter.splitter.tokenize(text): sentence_ids = Converter.process(sentence.strip()) if len(sentence_ids) > 0: doc_ids.append(sentence_ids) if len(doc_ids) > 0 and self.args.append_eos: doc_ids[-1].append(Converter.tokenizer.eos_token_id) return doc_ids, len(text.encode("utf-8")) def main(): args = get_args() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) convert = Converter(args) # Try tokenizer is availiable sample_tokenizer = getattr( tfs, args.tokenizer_name).from_pretrained(args.model_name) if sample_tokenizer.vocab_size < 2**16 - 1: save_dtype = np.uint16 else: save_dtype = np.int32 pool = multiprocessing.Pool(args.workers, initializer=convert.initializer) # We use BytesIO to store the ids. 
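    # Three in-memory streams back the final output files:
    #   token_ids_stream  -> <output_prefix>_ids.npy: token ids of all sentences,
    #                        concatenated (uint16 or int32 depending on vocab size)
    #   sentlens_stream   -> "lens" in <output_prefix>_idx.npz: token count of each sentence (int32)
    #   doc_cumsum_stream -> "docs" in <output_prefix>_idx.npz: cumulative sentence count per
    #                        document, starting at 0, so document i covers sentences [docs[i], docs[i+1])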
token_ids_stream = io.BytesIO() sentlens_stream = io.BytesIO() # # Cumsum on tokens num # sent_cumsum_stream = io.BytesIO() # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) # Cunsum on document on every sentence num, type=np.int64 doc_cumsum_stream = io.BytesIO() doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) sent_count = 0 # token_count = 0 file_paths.sort() step = 0 total_bytes_processed = 0 startup_start = time.time() for file_path in tqdm(file_paths): if file_path.endswith(".zst"): import zstandard cctx = zstandard.ZstdDecompressor() fh = open(file_path, 'rb') text = io.BufferedReader(cctx.stream_reader(fh)) elif file_path.endswith(".jsonl"): text = open(file_path, 'r', encoding='utf-8') else: print("Unexpected data format, skiped %s" % file_path) continue encoded_docs = pool.imap(convert.encode, text, 256) print("Processing %s" % file_path) for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): step += 1 total_bytes_processed += bytes_processed if len(doc) == 0: continue for sentence in doc: sentence_len = len(sentence) if sentence_len == 0: continue sentlens_stream.write( sentence_len.to_bytes( 4, byteorder='little', signed=True)) # token_count += sentence_len # sent_cumsum_stream.write( # token_count.to_bytes( # 8, byteorder='little', signed=True)) sent_count += 1 token_ids_stream.write( np.array( sentence, dtype=save_dtype).tobytes(order='C')) doc_cumsum_stream.write( sent_count.to_bytes( 8, byteorder='little', signed=True)) if step % args.log_interval == 0: current = time.time() elapsed = current - startup_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {step} documents", f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).", file=sys.stderr) pool.close() print("Saving tokens to files...") all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype) lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32) # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64) docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64) np.save(args.output_prefix + "_ids.npy", all_doc_ids) # np.savez(args.output_prefix + "_idx.npz", lens=lens, sents=sents, docs=docs) np.savez(args.output_prefix + "_idx.npz", lens=lens, docs=docs) print("Total sentences num: %d" % len(lens)) print("Total documents num: %d" % (len(docs) - 1)) print("Total tokens num: %d" % len(all_doc_ids)) print("Average tokens per sentence: %.2f" % (len(all_doc_ids) / len(lens))) print("Average tokens per document: %.2f" % (len(all_doc_ids) / (len(docs) - 1))) if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpus2020.md ================================================ ## CLUECorpus2020 语料 | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | CLUECorpus2020| 中文 | 200GB | CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本,详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD),用户可以通过邮件申请下载,方式如下: > 数据下载 > 申请方式: 将使用语料研究目的和用途,计划、研究机构和申请者介绍,发送到邮箱,并承诺不向第三方提供。 > > 邮箱: CLUEbenchmark@163.com,标题是:CLUECorpus2020 200G语料库 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpusSmall.md ================================================ # CLUECorpusSmall | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | CLUECorpusSmall| 中文 | 14GB | **数据集简介**:可用于语言建模、预训练或生成型任务等,数据量超过14G,近4000个定义良好的txt文件、50亿个字。主要部分来自于nlp_chinese_corpus项目 
包含如下子语料库(总共14G语料):新闻语料[news2016zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/6bac09db4e6d4857b6d680d34447457490cb2dbdd8b8462ea1780a407f38e12b?responseContentDisposition=attachment%3B%20filename%3Dnews2016zh_corpus.zip), 社区互动语料[webText2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/83da03f7b4974871a52348b41c16c7e3b34a26d5ca644f558df8435be4de51c3?responseContentDisposition=attachment%3B%20filename%3DwebText2019zh_corpus.zip),维基百科语料[wiki2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/d7a166408d8b4ffdaf4de9cfca09f6ee1e2340260f26440a92f78134d068b28f?responseContentDisposition=attachment%3B%20filename%3Dwiki2019zh_corpus.zip),评论数据语料[comment2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/b66ddd445735408383c42322850ac4bb82faf9cc611447c2affb925443de7a6d?responseContentDisposition=attachment%3B%20filename%3Dcomment2019zh_corpus.zip)。 ## 数据获取 用户可以通过官方github网页下载,https://github.com/CLUEbenchmark/CLUECorpus2020 。同时,为方便用户,我们也提供了aistudio数据集下载地址。[part1](https://aistudio.baidu.com/aistudio/datasetdetail/60598),[part2](https://aistudio.baidu.com/aistudio/datasetdetail/124357)。使用aistudio版本的数据,下载好后,可以核对md5值: ```shell > md5sum ./* 8a8be341ebce39cfe9524fb0b46b08c5 ./comment2019zh_corpus.zip 4bdc2c941a7adb4a061caf273fea42b8 ./news2016zh_corpus.zip fc582409f078b10d717caf233cc58ddd ./webText2019zh_corpus.zip 157dacde91dcbd2e52a60af49f710fa5 ./wiki2019zh_corpus.zip ``` 解压文件 ```shell unzip comment2019zh_corpus.zip -d clue_corpus_small_14g/comment2019zh_corpus unzip news2016zh_corpus.zip -d clue_corpus_small_14g/news2016zh_corpus unzip webText2019zh_corpus.zip -d clue_corpus_small_14g/webText2019zh_corpus unzip wiki2019zh_corpus.zip -d clue_corpus_small_14g/wiki2019zh_corpus ``` 将txt文件转换为jsonl格式 ``` cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl ``` 现在我们得到了jsonl格式的数据集。 ## ERNIE 中文预训练数据制作 下面是针对训练任务的数据集应用,此处以ernie为例。 ``` python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ --model_name ernie-1.0-base-zh \ --tokenizer_name ErnieTokenizer \ --input_path clue_corpus_small_14g.jsonl \ --split_sentences \ --chinese \ --cn_whole_word_segment \ --cn_seg_func jieba \ --output_prefix clue_corpus_small_14g_20220104 \ --workers 48 \ --log_interval 10000 ``` - model_name 可以更换为其他 ERNIE 系列模型,如: `ernie-3.0-base-zh` - workers 表示转化的线程数目 数据共有文档`15702702`条左右,由于分词比较耗时,大概一小时左右可以完成。在当前目录下产出训练所需数据。 ``` clue_corpus_small_14g_20220104_ids.npy clue_corpus_small_14g_20220104_idx.npz ``` 用户可以使用此数据进行预训练任务。 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/OpenWebText2.md ================================================ # OpenWebText2 | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | OpenWebText2 | 英文 | 70GB | ## 数据获取 [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/)是一个开源的英文网页文本数据集,数据来源于Reddit,经过去重、清洗、提取,最终包含800多万个文档。 本示例采用EleutherAI清洗好的[OpenWebText2数据](https://openwebtext2.readthedocs.io/en/latest/index.html#download-plug-and-play-version) 下载以后通过以下命令解压: ```shell wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar tar -xvf openwebtext2.json.zst.tar -C /path/to/openwebtext ``` ## GPT训练数据制作 然后使用[proprecess]](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/ernie/preprocess) 工具下的`create_pretraining_data.py`脚本进行数据集制作: ``` python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ 
--model_name gpt2-en \ --tokenizer_name GPTTokenizer \ --data_format JSON \ --input_path /path/to/openwebtext/ \ --append_eos \ --output_prefix gpt_openwebtext \ --workers 40 \ --log_interval 10000 ``` 处理时间约一个小时左右,就可以得到我们需要的`gpt_openwebtext_ids.npy`, `gpt_openwebtext_idx.npz`数据集文件。 为了方便用户运行测试本模型,本项目提供了处理好的300M的训练样本: ```shell wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz ``` 将所有预处理得到的文件统一放入一个文件夹中,以备训练使用: ``` mkdir data mv gpt_en_dataset_300m_ids.npy ./data mv gpt_en_dataset_300m_idx.npz ./data ``` ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/docs/WuDaoCorpusBase.md ================================================ # WuDaoCorpus2.0 Base 语料 | 名称 | 文本类型 | 纯文本大小 | |-|-|-| | WuDaoCorpus2.0 Base| 中文 | 200GB | WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB,目前开源的部分为WuDaoCorpus2.0 bases数据集,大小为200GB。 ## 数据获取 **1. 下载解压** 用户微信登录[官网](https://resource.wudaoai.cn/home),即可直接下载数据。下载好的压缩数据约 64GB。解压 ``` unrar x WuDaoCorpus2.0_base_200G.rar ``` **2. 语料分词** 由于WuDao数据集比较大,分词比较耗时,这里先进行了语料分词: ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/words_segmentation.py \ --input_path ./WuDaoCorpus2.0_base_200G \ --workers 40 \ --data_format wudao \ --cn_seg_func seg \ --output_path ./wudao_lac_cut \ ``` 注:预训练需要实现 SOP( Sentence Order Predict) 任务,在分词的同时,我们使用 简单规则 进行了文本断句。如果语料只有一句话,建议去除SOP loss,训练时设置 `binary_head=False`。 **3. 转换为jsonl格式** 文本转化完成后。我们使用 `ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py`重新转换为jsonl格式(分词完毕)。 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py \ --input_path ./wudao_lac_cut \ --output_path wudao_corpus_200g_0623.jsonl \ --workers 40 ``` 在当前目录下产出数据`wudao_corpus_200g_0623.jsonl`。格式如下: ``` {"text": "主持人 : 作为 一个 曲线救国 的 路线 我们 没 办法 。\n金鑫 : 考试 和 分数 只是 一个 阶段性 的 评价 手段 , 不是 目的 , 就 像 人 活着 的 目的 不是 为了 吃饭 , 吃饭 是 为了 让 我们 活下去 , 我们 学习 的 目的 不是 为了 考试 , 不是 为了 那个 分数 , 而是 我 掌握 了 知识 , 成为 我 内在 的 能力 , 将来 我 去 创作 创造 工作 , 我能 把 它 做 得 更好 。\n主持人 : 特别感谢 金总 今天 接受 我 的 访谈 , 也 让 我 从 别的 层面 看到 了 一对一 到底 存在 的 道理 是 什么 , 并且 能 发展 那么 好 的 原因 在 哪里 。\n在 节目 后 您 谈谈 您 对 一对一 未来 的 希望 , 包括 您 对 它 未来 的 设想 是 什么 ?\n金鑫 : 一对一 个性化 教育 现在 还是 在 初级阶段 , 如果 是 四个 阶段 的话 , 现在 还是 在 第一阶段 到 第二阶段 迈进 的 , 学大 在 这方面 我们 希望 能 做 得 更 快 更 远 一些 。\n将来 个性化 教育 一定 是 能够 帮助 学生 在 成绩 上 的 提升 , 能够 更好 的 成长 , 进而 成为 对 社会 对 国家 更 有用 的 人才 , 就是 我们 的 成绩 、 成长 、 成才 。\n学大 1 对 1 教育 的 教师 团队 由 各科 优秀教师 、 考试 指导 专家 、 心理 辅导 专家 及 学习 方法 指导 专家 组成 , 同时 配备 专职 班主任 及 学习 监管 师 , 全方位 辅导 顺利 而 有序 的 运作 。\n其中 部分 教师 担任 多年 毕业班 教学 工作 , 多次 参与 中 考试 命题 研究 及 阅卷 工作 , 深谙 中 考试 精髓 , 能够 在 短 的 时间 内 引领 学生 掌握 中 考试 知识 重点 , 快速 提分 。\n■ 对于 成绩 差 的 学生 : 注重 学生 基础知识 , 力求 让 学生 在 基础 中 找 自信 , 在 自信 中 提升 ;\n注重 主观题 的 解题 方法 及 思路 , 以此 来 加强 对 基础知识 的 运用 。\n■ 对于 成绩 需要 拔高 的 学生 : 找出 学生 弱点 , 加强 基础 , 重点 提高 弱势 项目 。\n"} {"text": "武田信玄 是 天生 的 武将 , 一生 开拓 了 八十五万 石至 九十余万 石之多 的 领地 。\n武田信玄 他 21 岁 时 流放 自己 的 父亲 武田信虎 至骏河 , 避免 父亲 传位 给 弟弟 , 从而 登上 了 第 19 代家督 之位 。\n他 将 信 浓国 ( 现 长野县 ) 纳入 控制 范围 后 , 又 与 当时 的 豪强 今井氏 、 北条 氏 结成 三国 军事同盟 , 与 上 杉谦信 在 川 中岛 前后 展开 了 五次 大战 。\n武田信玄 勇于 进攻 。\n他 连续 攻打 邻国 , 扩大 自己 势力范围 , 可称 遇神 杀神 , 遇佛 杀佛 。\n他 不仅 流放 了 自己 的 父亲 , 连 自己 的 嫡子 武田义信 因 与 他 在 战略 方向 上 相左 , 也 被 他 幽禁 于 佛寺 , 随即 被迫 自杀 。\n武田信玄 虽然 是 战国 武将 中 的 最强者 , 但 他 的 弱点 是 年龄 。\n信玄比 织田信长 年长 13 岁 , 比上 杉谦信 年长 9 岁 。\n当信 玄年 届 五十 之 时 , 信长 和 谦信 犹 在 壮年 。\n上杉谦信 而且 , 武田信玄 虽 驰骋 天下 , 却 未率 军 进过 京都 , 而 织田信长 在 永禄 十一年 ( 1568 年 ) 就 以 拥立 第 15 代 将军 足利义 昭 为名 率兵 上洛 了 。\n所谓 \" 制 京都 者 得 天下 \" , 所以 , 想要 一统天下 , 武田信玄 
的 时间 很 紧迫 。\n元龟 三年 ( 1572 年 ) , 武田信玄 与 室 町 幕府 第 15 代 将军 足利义 昭 、 本愿 寺 显如 , 以及 浅井 氏 、 朝仓氏 等 反 织田信长 实力 组成 联盟 , 编织 \" 反信长 包围圈 \" 。\n同年 10 月 3 日 , 武田信玄 率领 大军 , 开始 了 第一次 上洛之行 。\n是 年 , 信玄 52 岁 , 这 也许 是 他 统一天下 的 最后 一次 机会 。\n武田信玄 所 率领 的 是 当时 战国 最强 的 3 万甲州 精兵 。\n打着 \" 风林火山 \" 的 旗帜 , 武田军 第一站 就 到达 了 织田信长 的 同盟 德川家康 所在 的 三河 远江 。\n织田信长 德川家康 的 军队 在 甲州 精兵 之前 显得 不堪一击 , 到 了 10 月 13 日 , 只来 成 、 天 方城 、 一 宫城 、 饭田 城 、 各和城 、 向 笠 城 等 城池 纷纷 被 攻陷 。\n德川家康 见势不妙 , 决定 在 浜松 城中 闭门不出 。\n但是 武田信玄 毫不 松懈 , 又 将 家康 在 远江 地区 的 重要 据点 二俣城 攻破 。\n德川家康 集合 所有 军队 共 1 万 1 千人 , 出城 与 信玄 决一死战 , 但 大败 而 还 , 险些 失 了 性命 。\n这次 战争 被 称为 \" 三方 原战 \" , 德川家康 曾经 承认 这次 战争 是 他 生平 最大 的 失败 。\n"} ``` ## ERNIE 中文预训练数据制作 下面是针对训练任务的数据集应用,此处以ernie为例。 ``` python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \ --model_name ernie-1.0-base-zh \ --tokenizer_name ErnieTokenizer \ --input_path wudao_corpus_200g_0623.jsonl \ --split_sentences \ --chinese \ --cn_whole_word_segment \ --cn_seg_func jieba \ --cn_splited \ --output_prefix wudao_corpus_200g_0623 \ --workers 48 \ --log_interval 10000 ``` - 我们提前分词好了,所以加上了 `cn_splited`,否则不需要使用此选项。 - model_name 可以更换为其他 ERNIE 系列模型,如: `ernie-3.0-base-zh` - workers 表示转化的线程数目 在当前目录下产出训练所需数据。 ``` wudao_corpus_200g_0623_ids.npy wudao_corpus_200g_0623_idx.npz ``` 用户可以使用此数据进行预训练任务。 ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import re import argparse import json import multiprocessing import sys import time import shutil from functools import partial import numpy as np from tqdm import tqdm def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--input_path', type=str, required=True, help='Path to you raw files. Folder or file path.') parser.add_argument( '--output_path', type=str, required=True, help='Path to save the output json files.') parser.add_argument( '--json_key', type=str, default='text', help='The content key of json file.') parser.add_argument( '--doc_spliter', type=str, default='', help="Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank." 
) parser.add_argument( '--min_doc_length', type=int, default=10, help="Minimal char of a documment.") parser.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') parser.add_argument( '--log_interval', type=int, default=1, help='Interval between progress updates.') parser.add_argument( '--no-merge', action='store_true', help='Don\'t merge the file.') parser.add_argument( '--no-shuffle', action='store_true', help='Don\'t shuffle the file.') args = parser.parse_args() return args def raw_text_to_json(path, doc_spliter="", json_key="text", min_doc_length=10): path = os.path.abspath(path) if not os.path.exists(path): print("No found file %s" % path) return 0, None out_filepath = path + ".jsonl" fout = open(out_filepath, "w", encoding="utf-8") len_files = 0 with open(path, "r") as f: doc = "" line = f.readline() while line: len_files += len(line) if line.strip() == doc_spliter: if len(doc) > min_doc_length: fout.write( json.dumps( { json_key: doc }, ensure_ascii=False) + "\n") doc = "" else: doc += line line = f.readline() if len(doc) > min_doc_length: fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n") doc = "" return len_files, out_filepath def merge_file(file_paths, output_path): if not output_path.endswith(".jsonl"): output_path = output_path + ".jsonl" print("Merging files into %s" % output_path) with open(output_path, 'wb') as wfd: for f in file_paths: if f is not None and os.path.exists(f): with open(f, 'rb') as fd: shutil.copyfileobj(fd, wfd) os.remove(f) print("File save in %s" % output_path) return output_path def shuffle_file(output_path): print("Shuffling the jsonl file...") if os.path.exists(output_path): os.system("shuf %s -o %s" % (output_path, output_path)) print("File shuffled!!!") else: raise ValueError("File not found: %s" % output_path) def main(): args = get_args() startup_start = time.time() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) pool = multiprocessing.Pool(args.workers) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) trans_json = partial( raw_text_to_json, doc_spliter=args.doc_spliter, json_key=args.json_key, min_doc_length=args.min_doc_length) encoded_files = pool.imap(trans_json, file_paths, 1) out_paths = [] for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1): total_bytes_processed += bytes_processed out_paths.append(out_path) master_start = time.time() if i % args.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {i} files", f"({i/elapsed} files/s, {mbs} MB/s).", file=sys.stderr) if not args.no_merge: output_path = merge_file(out_paths, args.output_path) if not args.no_shuffle: shuffle_file(output_path) if __name__ == "__main__": main() #profile.run("main()", "testprof") ================================================ FILE: ppfleetx/data/data_tools/ernie/preprocess/words_segmentation.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import re import argparse import multiprocessing import os import time import jieba import sys from functools import partial def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--input_path', type=str, required=True, help='Path to you raw files. Folder or file path.') parser.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') parser.add_argument( '--output_path', type=str, default="./tmp", help='Path to save the output json files.') parser.add_argument( '--data_format', type=str, default="jsonl", choices=["jsonl", "wudao"], help='Path to you raw files. Folder or file path.') parser.add_argument( '--cn_seg_func', type=str, default='jieba', choices=['lac', 'seg', 'jieba'], help='Words segment function for chinese words.') parser.add_argument( '--log_interval', type=int, default=1, help='Interval between progress updates.') args = parser.parse_args() return args def lexical_analysis_fn(): from LAC import LAC lac = LAC(mode="lac") def process(line): words, _ = lac.run(line) return words return process def chinese_segmentation_fn(): from LAC import LAC lac_cws = LAC(mode='seg') def process(line): words = lac_cws.run(line) return words return process def jieba_segmentation_fn(): import jieba def process(line): words = jieba.cut(line) return list(words) return process CHINESE_SEG_FUNC = { 'lac': lexical_analysis_fn(), 'seg': chinese_segmentation_fn(), 'jieba': jieba_segmentation_fn(), } def read_wudao(path): print("Loading %s" % path) with open(path, "r") as f: try: contents = json.load(f) except Exception as e: print("Failed to load %s" % path) raise StopIteration for js in contents: yield js["content"] def read_jsonl(path): print("Loading %s" % path) with open(path, "r") as f: line = f.readline() while line: contents = json.load(f) yield contents["text"] line = f.readline() READFILE_FUNC = { 'jsonl': read_jsonl, 'wudao': read_wudao, } special_chars = ['\n', '。', '?', '?', ' ', ';', ';', '!', '!'] split_chars = ['。', '?', '?', ';', ';', '!', '!'] def text_to_text(path, output_path, read_func, seg_func): out_name = os.path.join(output_path, path[-20:]) print("Write into %s" % out_name) if os.path.exists(out_name): print("File exists %s" % out_name) return 0, None seg_func = CHINESE_SEG_FUNC[seg_func] read_func = READFILE_FUNC[read_func] import time s = time.time() data_len = 0 count = 0 with open(out_name, "w") as f: for text in read_func(path): # for js in contents: count += 1 # text = js["content"] data_len += len(text.encode("utf-8")) # make special char only once, # because of those token will be treat as sentence spliter. 
# 此处为断句逻辑 for char in special_chars: text = re.sub('[' + char + ']+[ ]*', char, text) for char in split_chars: text = text.replace(char, char + "\n") # 此处为分词逻辑 final = "" for line in text.split("\n"): if len(line) == 0: continue words = seg_func(line) final += " ".join(words) + "\n" f.write(final + "\n") return data_len, None def main(): args = get_args() startup_start = time.time() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) pool = multiprocessing.Pool(args.workers) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) if not os.path.exists(args.output_path): os.makedirs(args.output_path) trans_func = partial( text_to_text, output_path=args.output_path, seg_func=args.cn_seg_func, read_func=args.data_format) encoded_files = pool.imap(trans_func, file_paths, 1) out_paths = [] for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1): total_bytes_processed += bytes_processed out_paths.append(out_path) master_start = time.time() if i % args.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {i} files", f"({i/elapsed} files/s, {mbs} MB/s).", file=sys.stderr) pool.close() if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/data_tools/gpt/README.md ================================================ ## GPT 模型预训练数据准备流程(中文数据处理正在支持中) 我们将预训练数据过程划分为以下2个部分: 1. 原始数据转换,原始文本转换为jsonl的json字符串格式。 2. 数据ID化,断句、分词、tokenize转化为token id格式。 本目录下主要包含以下文件: ``` ├── preprocess_data.py # 将jsonl文本,断句、分词后,tokenizer转化为token id。 ├── README.md # 预训练数据准备流程教程 └── raw_trans_to_json.py # 原始文本数据转化的脚本,将数据转化为json串格式。 ``` ## 目录切换 ``` # 如果您还未下载 PaddleFleetX 套件,请先 clone 套件 # git clone https://github.com/PaddlePaddle/PaddleFleetX.git cd PaddleFleetX # 以下所有命令都在 PaddleFleetX 根目录中执行 ``` ## 环境依赖 - paddlepaddle-gpu>=2.3.0 - python==3.7 - tqdm==4.54.1 - numpy==1.20.1 - pybind11==2.10.0 安装命令`pip install -r requirements.txt`。 ## 训练全流程数据 Pipeline |步骤|阶段|数据格式| 样例| |-|-|-|-| | 原始数据清洗 | 原始数据准备|原始数据:
每个doc之间用空行间隔开
- 中文,默认每句换行符,作为句子结束。
- 英文,默认使用nltk判断句子结束。doc是由一段或多段文字组成,每段文字由一句或多句话组成。 | ```飞桨是功能完备、开源开放的产业级深度学习平台。```
```飞桨拥有核心训练和推理框架、基础模型库。```

```PaddleNLP是自然语言处理领域的优秀工具。``` | |原始数据转换
`raw_trans_to_json.py`|预处理|jsonl格式:每个doc对应一行json字符串| ```{"text": "飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有..."}```
```{"text": "PaddleNLP是自然语言..."}``` |数据ID化
`preprocess_data.py`|预处理| npy格式:数据id化后的token id
    npz格式:数据句子、文章位置索引 | - ## 全流程示例 下面以 GPT 预训练为例,简要介绍一下预训练数据处理的全流程。 ### 原始数据 首先下载样例数据: ``` mkdir -p dataset/wikitext_103_en wget -O dataset/wikitext_103_en/wikitext-103-en.txt http://fleet.bj.bcebos.com/datasets/gpt/wikitext-103-en.txt ``` ### 原始数据转换 jsonl 格式 使用`raw_trans_to_json.py`转化为json串格式,下面是脚本的使用说明 ``` optional arguments: -h, --help show this help message and exit --input_path INPUT_PATH Path to you raw files. Folder or file path. 必须设置,可以是文件夹或者单个文件。文件夹中的目录默认最多搜索两层子目录。 --output_path OUTPUT_PATH Path to save the output json files. 必须设置,输出文件的名字。 --json_key JSON_KEY The content key of json file. 建议不修改,默认的key是text --doc_spliter DOC_SPLITER Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank. 根据实际情况修改,默认空行作为文章换行符。 --min_doc_length MIN_DOC_LENGTH Minimal char of a documment. 可选。过滤掉长度多短的文章,默认值10 --workers WORKERS Number of worker processes to launch 可选。多进程转化文件,适用于 input_path 中包含的文件数据较多的情况。每个文件,分配给不同worker处理 --log_interval LOG_INTERVAL Interval between progress updates. 可选。此处的interval是值处理完文件个数的间隔。 --no-merge Don't merge the file. 可选。默认不开启这个选项,默认每个文件转换的jsonl文本,会拼接成到同一个文件。 --no-shuffle Don't shuffle the file. 可选。默认不开启这个选项,默认对处理完进行shuffle。 ``` 根据说明,我们使用下面简单命令,可以得到`wikitext_103_en.jsonl`文件。此处,我们对所有doc进行了shuffle。 ```shell python ppfleetx/data/data_tools/gpt/raw_trans_to_json.py --input_path ./dataset/wikitext_103_en --output_path ./dataset/wikitext_103_en/wikitext_103_en # output of terminal # Time to startup: 0.0075109004974365234 # Processed 1 files (0.12870440603278582 files/s, 64.80481421466284 MB/s). # Merging files into wikitext_103_en.jsonl # File save in wikitext_103_en.jsonl # Shuffling the jsonl file... # File shuffled!!! # 查看数据。因为对数据有 shuffle,下面的内容可能会不一样。 tail -1 ./dataset/wikitext_103_en/wikitext_103_en.jsonl {"text": "The album was released in June 1973 . Although it received good reviews , it did not sell well , except in Austin , where it sold more copies than earlier records by Nelson did nationwide . The recording led Nelson to a new style ; he later stated regarding his new musical identity that Shotgun Willie had \" cleared his throat . \" It became his breakthrough record , and one of the first of the outlaw movement , music created without the influence of the conservative Nashville Sound . The album — the first to feature Nelson with long hair and a beard on the cover — gained him the interest of younger audiences . It peaked at number 41 on Billboard 's album chart and the songs \" Shotgun Willie \" and \" Stay All Night ( Stay A Little Longer ) \" peaked at number 60 and 22 on Billboard Hot 100 respectively .\nRolling Stone wrote : \" With this flawless album , Willie Nelson finally demonstrates why he has for so long been regarded as a Country & Western singer @-@ songwriter 's singer @-@ songwriter ... At the age of 39 , Nelson finally seems destined for the stardom he deserves \" . Robert Christgau wrote : \" This attempt to turn Nelson into a star runs into trouble when it induces him to outshout Memphis horns or Western swing . \"\nBillboard wrote : \" This is Willie Nelson at his narrative best . He writes and sings with the love and the hurt and the down @-@ to @-@ earth things he feels , and he has a few peers . \" Texas Monthly praised Nelson and Wexler regarding the change in musical style : \" They 've switched his arrangements from Ray Price to Ray Charles — the result : a revitalized music . 
He 's the same old Willie , but veteran producer Jerry Wexler finally captured on wax the energy Nelson projects in person \" . School Library Journal wrote : \" Willie Nelson differs ( from ) rock artists framing their music with a country & western facade — in that he appears a honky @-@ tonk stardust cowboy to the core . This album abounds in unabashed sentimentalism , nasal singing , lyrics preoccupied with booze , religion , and love gone bad , and stereotyped Nashville instrumentation ( twangy steel guitars , fiddles , and a clean rhythm section characterized by the minimal use of bass drum and cymbals , both of which gain heavy mileage with rock performers ) .\nStephen Thomas Erlewine wrote in his review for Allmusic : \" Willie Nelson offered his finest record to date for his debut – possibly his finest album ever . Shotgun Willie encapsulates Willie 's world view and music , finding him at a peak as a composer , interpreter , and performer . This is laid @-@ back , deceptively complex music , equal parts country , rock attitude , jazz musicianship , and troubadour storytelling \" .\n"} ``` ### 数据ID化 我们使用 `preprocess_data.py` 脚本将前面得到的 `wikitext_103_en.jsonl` 进行tokenize id化处理。 ``` optional arguments: -h, --help show this help message and exit --model_name MODEL_NAME What model to use. 必须设置,如:gpt2 --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer} What type of tokenizer to use. 模型对应的tokenizer, 目前暂时只支持 Ernie,Bert,GPT data input/output: --input_path INPUT_PATH Path to input JSON files. 必须设置,输入文件jsonl的目录 --output_prefix OUTPUT_PREFIX Output prefix to store output file. 必须设置,输出文件的名称。 假设名称为XXX,则会输出 XXX_ids.npy, XXX_idx.npz 两个文件。 npy文件,数据id化后的token ids; npz文件,数据句子、文章位置索引。 --data_format {JSON} Only support json format for now. One document per line. 不需要设置。目前默认处理jsonl数据格式 --json_key JSON_KEY For JSON format. Space separate listed of keys to extract from json 文本串json的key值。同前面trans_to_json.py的json_key,默认text为key --split_sentences Split documents into sentences. 是否需要将文章划分成句子。一般而言,GPT不需要,Bert/Ernie模型需要 chinese words: --chinese Is corpus need words segmentation step for chinese words. 中文情形必须设置。处理的文本类型是否是中文。 --cn_whole_word_segment Is corpus need words segmentation step for chinese words WWM. 可选。是否需要WWM策略。一般而言,Bert/Ernie模型需要,GPT不需要。 --cn_seg_func {lac,seg,jieba} Words segment function for chinese words. 默认jieba,jieba速度较快,lac模型更准确,计算量高。 --cn_splited Is chinese corpus is splited in to words. 分词后的文本,可选。设置此选项则,cn_seg_func不起作用。 例如分词后文本串 "中国 效仿 西方 发展 工业 的过 程" --cn_split_dimer CN_SPLIT_DIMER Split dimer between chinese words. 配合cn_splited使用,默认空格表示分词间隔。 common config: --append_eos Append an token to the end of a document. gpt模型专用,gpt设置此选项,表示doc结束。 --log_interval LOG_INTERVAL Interval between progress updates 打印日志间隔,interval表示处理 文本行数/doc数的 间隔。 --workers WORKERS Number of worker processes to launch 处理文本id化的进程个数。 ``` 通过下面脚本转化,我们可以得到处理好的预训练数据,token ids:`wikitext_103_en.npy`, 文章索引信息`wikitext_103_en.npz`. 在使用 `GPTTokenizer` 时需要用到 `gpt2-vocab.json` 与 `gpt2-merges.txt`,如果没有下载缓存过这两个文件,脚本会自动下载并缓存。当遇到网络问题时,可以自行下载并将这两个文件放置在 `~/.cache/ppfleetx/` 目录下。 ``` python ppfleetx/data/data_tools/gpt/preprocess_data.py \ --model_name gpt2 \ --tokenizer_name GPTTokenizer \ --data_format JSON \ --input_path ./dataset/wikitext_103_en/wikitext_103_en.jsonl \ --append_eos \ --output_prefix ./dataset/wikitext_103_en/wikitext_103_en \ --workers 40 \ --log_interval 1000 # 处理完后 terminal 输出 # Processed 267000 documents (9843.34 docs/s, 18.4880 MB/s). 
# Processed 268000 documents (9869.46 docs/s, 18.5351 MB/s). # 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:27<00:00, 27.17s/it] # Saving tokens to files... # Total sentences num: 268492 # Total documents num: 268492 # Total tokens num: 114130026 # Average tokens per sentence: 425.08 # Average tokens per document: 425.08 ``` ## 参考内容 注: 大部分数据流程,参考自[Megatron](https://github.com/NVIDIA/Megatron-LM)和[PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP),特此表达感谢。 ================================================ FILE: ppfleetx/data/data_tools/gpt/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/data_tools/gpt/preprocess_data.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import io import re import argparse import json import multiprocessing import sys import time import numpy as np from tqdm import tqdm try: from ppfleetx.data import tokenizers as tfs except ImportError: __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../../../'))) from ppfleetx.data import tokenizers as tfs from ppfleetx.utils.log import logger try: import nltk nltk_available = True except ImportError: nltk_available = False CHINESE_SEG_FUNC = {} def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--model_name', type=str, required=True, help='What model to use.') parser.add_argument( '--tokenizer_name', type=str, required=True, choices=[ 'ErnieTokenizer', 'BertTokenizer', 'GPTTokenizer', 'GPTChineseTokenizer', 'ElectraTokenizer' ], help='What type of tokenizer to use.') group = parser.add_argument_group(title='data input/output') group.add_argument( '--input_path', type=str, required=True, help='Path to input JSON files.') group.add_argument( '--output_prefix', type=str, required=True, help='Output prefix to store output file.') group.add_argument( '--data_format', type=str, default='text', choices=['JSON'], help='Only support json format for now. One document per line.') group.add_argument( '--json_key', type=str, default='text', help='For JSON format. 
Space separate listed of keys to extract from json' ) group.add_argument( '--split_sentences', action='store_true', help='Split documents into sentences.') group = parser.add_argument_group(title='chinese words') group.add_argument( '--chinese', action='store_true', help="Is corpus need words segmentation step for chinese words.") group.add_argument( '--cn_whole_word_segment', action='store_true', help="Is corpus need words segmentation step for chinese words WWM.") group.add_argument( '--cn_seg_func', type=str, default='jieba', choices=['lac', 'seg', 'jieba'], help='Words segment function for chinese words.') group.add_argument( '--cn_splited', action='store_true', help="Is chinese corpus is splited in to words.") group.add_argument( '--cn_split_dimer', type=str, default=' ', help="Split dimer between chinese words.") group = parser.add_argument_group(title='common config') group.add_argument( '--append_eos', action='store_true', help='Append an token to the end of a document.') group.add_argument( '--log_interval', type=int, default=100, help='Interval between progress updates') group.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') args = parser.parse_args() if args.chinese: global CHINESE_SEG_FUNC CHINESE_SEG_FUNC['lac'] = lexical_analysis_fn() CHINESE_SEG_FUNC['seg'] = chinese_segmentation_fn() CHINESE_SEG_FUNC['jieba'] = jieba_segmentation_fn() return args def lexical_analysis_fn(): from LAC import LAC lac = LAC(mode="lac") def process(line): words, _ = lac.run(line) return words return process def chinese_segmentation_fn(): from LAC import LAC lac_cws = LAC(mode='seg') def process(line): words = lac.run(line) return words return process def jieba_segmentation_fn(): import jieba def process(line): words = jieba.cut(line) return list(words) return process def get_whole_word_mask_tokens(tokens, words, max_word_length=4): """ Do whole word mask on Chinese word. First, we do Chinese word segmentation on the sequence of tokens, which are from the WordPiece tokenization. Then, we add the '##' mark on chinese characters which are in the middle of Chinese words. And if the tokens are not chinese characters, we just exploit the results of WordPiece tokenization as words. Such as, - text line : 通过利用mercer核,将样本从输入空间映射到高维特征空间,使原来没有显现的特征突现出来,取得了很好的图像分割效果。 - the input tokens (after WordPiece): ['通', '过', '利', '用', 'me', '##rc', '##er', '核', ',', '将', '样', '本', '从', '输', '入', '空', '间', '映', '射', '到', '高', '维', '特', '征', '空', '间', ',', '使', '原', '来', '没', '有', '显', '现', '的', '特', '征', '突', '现', '出', '来', ',', '取', '得', '了', '很', '好', '的', '图', '像', '分', '割', '效', '果', '。'] - the Chinese words (after Chinese word segmentation like jieba) ['通过', '利用', 'mercer', '核', ',', '将', '样本', '从', '输入', '空间', '映射', '到', '高维', '特征', '空间', ',', '使', '原来', '没有', '显现', '的', '特征', '突现', '出来', ',', '取得', '了', '很', '好', '的', '图像', '分割', '效果', '。'] - the output whole word mask tokens: ['通', '##过', '利', '##用', 'me', '##rc', '##er', '核', ',', '将', '样', '##本', '从', '输', '##入', '空', '##间', '映', '##射', '到', '高', '##维', '特', '##征', '空', '##间', ',', '使', '原', '##来', '没', '##有', '显', '##现', '的', '特', '##征', '突', '##现', '出', '##来', ',', '取', '##得', '了', '很', '好', '的', '图', '##像', '分', '##割', '效', '##果', '。'] Args: tokens(list(str)): The sequence of tokens, which are from the WordPiece tokenization. words(list(str)): The sequence of Chinese words. max_word_length(int, optional): The maximum chinese character in Chinese words. It avoids too long Chinese word to be masked. Defaults as 4. 
Returns: new_tokens(list(str)): The new token will be done with whole word masking strategy. """ new_tokens = [] # opt for long document words_set = set(words) i = 0 while i < len(tokens): # non-chinese character, then do word piece if len(re.findall('[\u4E00-\u9FA5]', tokens[i])) == 0: new_tokens.append(tokens[i]) i += 1 continue # add "##" mark on the middel tokens of Chinese words # such as ["通过", "利用"] -> ["通", "##过", "利", "##用"] has_add = False for length in range(max_word_length, 0, -1): if i + length > len(tokens): continue if ''.join(tokens[i:i + length]) in words_set: new_tokens.append(tokens[i]) for l in range(1, length): new_tokens.append('##' + tokens[i + l]) i += length has_add = True break if not has_add: new_tokens.append(tokens[i]) i += 1 return new_tokens class IdentitySplitter(object): def tokenize(self, *text): return text class NewlineSplitter(): def tokenize(self, text): return text.split("\n") class Converter(object): def __init__(self, args): self.args = args def initializer(self): Converter.tokenizer = getattr( tfs, self.args.tokenizer_name).from_pretrained(self.args.model_name) # Split document to sentence. if self.args.split_sentences: if self.args.chinese: Converter.splitter = NewlineSplitter() else: if not nltk_available: print("NLTK is not available to split sentences.") exit() splitter = nltk.load("tokenizers/punkt/english.pickle") Converter.splitter = splitter else: Converter.splitter = IdentitySplitter() # Split sentence whole words mask for chinese if self.args.cn_whole_word_segment: if self.args.cn_splited: Converter.segment_func = lambda text: text.split(self.args.cn_split_dimer) else: Converter.segment_func = CHINESE_SEG_FUNC[ self.args.cn_seg_func] Converter.whole_word_mask = get_whole_word_mask_tokens else: Converter.segment_func = lambda x: x Converter.whole_word_mask = lambda x, y: x def process(text): words = Converter.segment_func(text) tokens = Converter.tokenizer.tokenize("".join(words)) tokens = Converter.whole_word_mask(tokens, words) tokens = Converter.tokenizer.convert_tokens_to_ids(tokens) return tokens Converter.process = process def encode(self, json_line): text = json.loads(json_line)[self.args.json_key] doc_ids = [] for sentence in Converter.splitter.tokenize(text): sentence_ids = Converter.process(sentence.strip()) if len(sentence_ids) > 0: doc_ids.append(sentence_ids) if len(doc_ids) > 0 and self.args.append_eos: doc_ids[-1].append(Converter.tokenizer.eos_token_id) return doc_ids, len(text.encode("utf-8")) def main(): args = get_args() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) if len(file_paths) == 0: print("No input file found!") exit(-1) convert = Converter(args) # Try tokenizer is availiable sample_tokenizer = getattr( tfs, args.tokenizer_name).from_pretrained(args.model_name) if sample_tokenizer.vocab_size < 2**16 - 1: save_dtype = np.uint16 else: save_dtype = np.int32 pool = multiprocessing.Pool(args.workers, initializer=convert.initializer) # We use BytesIO to store the ids. 
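    # Stream layout used below:
    #   token_ids_stream  - all token ids concatenated (dtype `save_dtype`), saved to <output_prefix>_ids.npy
    #   sentlens_stream   - one int32 length per sentence, saved as `lens` in <output_prefix>_idx.npz
    #   doc_cumsum_stream - int64 cumulative sentence count per document (seeded with 0),
    #                       saved as `docs` in <output_prefix>_idx.npz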
token_ids_stream = io.BytesIO() sentlens_stream = io.BytesIO() # # Cumsum on tokens num # sent_cumsum_stream = io.BytesIO() # sent_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) # Cunsum on document on every sentence num, type=np.int64 doc_cumsum_stream = io.BytesIO() doc_cumsum_stream.write((0).to_bytes(8, byteorder='little', signed=True)) sent_count = 0 # token_count = 0 file_paths.sort() step = 0 total_bytes_processed = 0 startup_start = time.time() for file_path in tqdm(file_paths): if file_path.endswith(".zst"): import zstandard cctx = zstandard.ZstdDecompressor() fh = open(file_path, 'rb') text = io.BufferedReader(cctx.stream_reader(fh)) elif file_path.endswith(".jsonl"): text = open(file_path, 'r', encoding='utf-8') else: print("Unexpected data format, skiped %s" % file_path) continue encoded_docs = pool.imap(convert.encode, text, 256) print("Processing %s" % file_path) for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): step += 1 total_bytes_processed += bytes_processed if len(doc) == 0: continue for sentence in doc: sentence_len = len(sentence) if sentence_len == 0: continue sentlens_stream.write( sentence_len.to_bytes( 4, byteorder='little', signed=True)) # token_count += sentence_len # sent_cumsum_stream.write( # token_count.to_bytes( # 8, byteorder='little', signed=True)) sent_count += 1 token_ids_stream.write( np.array( sentence, dtype=save_dtype).tobytes(order='C')) doc_cumsum_stream.write( sent_count.to_bytes( 8, byteorder='little', signed=True)) if step % args.log_interval == 0: current = time.time() elapsed = current - startup_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {step} documents", f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).", file=sys.stderr) pool.close() print("Saving tokens to files...") all_doc_ids = np.frombuffer(token_ids_stream.getbuffer(), dtype=save_dtype) lens = np.frombuffer(sentlens_stream.getbuffer(), dtype=np.int32) # sents = np.frombuffer(sent_cumsum_stream.getbuffer(), dtype=np.int64) docs = np.frombuffer(doc_cumsum_stream.getbuffer(), dtype=np.int64) np.save(args.output_prefix + "_ids.npy", all_doc_ids) # np.savez(args.output_prefix + "_idx.npz", lens=lens, sents=sents, docs=docs) np.savez(args.output_prefix + "_idx.npz", lens=lens, docs=docs) print("Total sentences num: %d" % len(lens)) print("Total documents num: %d" % (len(docs) - 1)) print("Total tokens num: %d" % len(all_doc_ids)) print("Average tokens per sentence: %.2f" % (len(all_doc_ids) / len(lens))) print("Average tokens per document: %.2f" % (len(all_doc_ids) / (len(docs) - 1))) if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/data_tools/gpt/raw_trans_to_json.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import re import argparse import json import multiprocessing import sys import time import shutil from functools import partial import numpy as np from tqdm import tqdm def get_args(): parser = argparse.ArgumentParser() parser.add_argument( '--input_path', type=str, required=True, help='Path to you raw files. Folder or file path.') parser.add_argument( '--output_path', type=str, required=True, help='Path to save the output json files.') parser.add_argument( '--json_key', type=str, default='text', help='The content key of json file.') parser.add_argument( '--doc_spliter', type=str, default='', help="Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank." ) parser.add_argument( '--min_doc_length', type=int, default=10, help="Minimal char of a documment.") parser.add_argument( '--workers', type=int, default=1, help='Number of worker processes to launch') parser.add_argument( '--log_interval', type=int, default=1, help='Interval between progress updates.') parser.add_argument( '--no-merge', action='store_true', help='Don\'t merge the file.') parser.add_argument( '--no-shuffle', action='store_true', help='Don\'t shuffle the file.') args = parser.parse_args() return args def raw_text_to_json(path, doc_spliter="", json_key="text", min_doc_length=10): path = os.path.abspath(path) if not os.path.exists(path): print("No found file %s" % path) return 0, None out_filepath = path + ".jsonl" fout = open(out_filepath, "w", encoding="utf-8") len_files = 0 with open(path, "r") as f: doc = "" line = f.readline() while line: len_files += len(line) if line.strip() == doc_spliter: if len(doc) > min_doc_length: fout.write( json.dumps( { json_key: doc }, ensure_ascii=False) + "\n") doc = "" else: doc += line line = f.readline() if len(doc) > min_doc_length: fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n") doc = "" return len_files, out_filepath def merge_file(file_paths, output_path): if not output_path.endswith(".jsonl"): output_path = output_path + ".jsonl" print("Merging files into %s" % output_path) with open(output_path, 'wb') as wfd: for f in file_paths: if f is not None and os.path.exists(f): with open(f, 'rb') as fd: shutil.copyfileobj(fd, wfd) os.remove(f) print("File save in %s" % output_path) return output_path def shuffle_file(output_path): print("Shuffling the jsonl file...") if os.path.exists(output_path): os.system("shuf %s -o %s" % (output_path, output_path)) print("File shuffled!!!") else: raise ValueError("File not found: %s" % output_path) def main(): args = get_args() startup_start = time.time() file_paths = [] if os.path.isfile(args.input_path): file_paths.append(args.input_path) else: for root, _, fs in os.walk(args.input_path): for f in fs: file_paths.append(os.path.join(root, f)) pool = multiprocessing.Pool(args.workers) startup_end = time.time() proc_start = time.time() total_bytes_processed = 0 print("Time to startup:", startup_end - startup_start) trans_json = partial( raw_text_to_json, doc_spliter=args.doc_spliter, json_key=args.json_key, min_doc_length=args.min_doc_length) encoded_files = pool.imap(trans_json, file_paths, 1) out_paths = [] for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1): total_bytes_processed += bytes_processed out_paths.append(out_path) master_start = time.time() if i % args.log_interval == 0: current = time.time() elapsed = current - proc_start mbs = total_bytes_processed / elapsed / 1024 / 1024 print( f"Processed {i} files", f"({i/elapsed} files/s, {mbs} MB/s).", 
file=sys.stderr) if not args.no_merge: output_path = merge_file(out_paths, args.output_path) if not args.no_shuffle: shuffle_file(output_path) if __name__ == "__main__": main() ================================================ FILE: ppfleetx/data/dataset/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .vision_dataset import ( GeneralClsDataset, ImageFolder, CIFAR10, ContrativeLearningDataset, ) from .multimodal_dataset import ImagenDataset from .gpt_dataset import GPTDataset, LM_Eval_Dataset, Lambada_Eval_Dataset from .glue_dataset import * from .ernie.ernie_dataset import ErnieDataset, ErnieSeqClsDataset ================================================ FILE: ppfleetx/data/dataset/ernie/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/dataset/ernie/dataset_utils.py ================================================ # coding=utf-8 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Google AI Language Team Authors, and NVIDIA. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Most of the code here has been copied from: # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. import math import os import re import time import collections import numpy as np import paddle def get_local_rank(): return int(os.getenv("PADDLE_RANK_IN_NODE", 0)) print_rank_0 = print # COMPILED = False # DSET_TYPE_BERT = 'standard_bert' # DSET_TYPE_T5 = 't5' # DSET_TYPE_ERNIE = 'ernie' # DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_T5, DSET_TYPE_ERNIE] def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples): # The data prefix should be in the format of: # weight-1, data-prefix-1, weight-2, data-prefix-2, .. 
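    # e.g. data_prefix = ["0.3", "corpus_a", "0.7", "corpus_b"]
    #   -> prefixes = ["corpus_a", "corpus_b"], weights normalized to [0.3, 0.7],
    #      and each per-split sample count is scaled by weight * 1.005 to leave a
    #      small margin when blending datasets.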
assert len(data_prefix) % 2 == 0 num_datasets = len(data_prefix) // 2 weights = [0] * num_datasets prefixes = [0] * num_datasets for i in range(num_datasets): weights[i] = float(data_prefix[2 * i]) prefixes[i] = (data_prefix[2 * i + 1]).strip() # Normalize weights weight_sum = 0.0 for weight in weights: weight_sum += weight assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] # Add 0.5% (the 1.005 factor) so in case the bleding dataset does # not uniformly distribute the number of samples, we still have # samples left to feed to the network. datasets_train_valid_test_num_samples = [] for weight in weights: datasets_train_valid_test_num_samples.append([ int(math.ceil(val * weight * 1.005)) for val in train_valid_test_num_samples ]) return prefixes, weights, datasets_train_valid_test_num_samples class MMapIndexedDataset(paddle.io.Dataset): def __init__(self, path, skip_warmup=False): super().__init__() self._path = path # All documment ids, extend as 1-D array. for suffix in ["_ids.npy", "_idx.npz"]: # print(path, suffix) if not os.path.isfile(path + suffix): raise ValueError("File Not found, %s" % (path + suffix)) self._token_ids = np.load( path + "_ids.npy", mmap_mode="r", allow_pickle=True) process_data = np.load(path + "_idx.npz") self._sizes = process_data["lens"] self._pointers = np.empty(len(self._sizes) + 1, dtype=np.int64) self._pointers[0] = 0 np.cumsum(self._sizes, out=self._pointers[1:]) self._doc_idx = process_data["docs"] def __getstate__(self): return self._path def __len__(self): return len(self._sizes) # @lru_cache(maxsize=8) def __getitem__(self, idx): if isinstance(idx, int): size = self._sizes[idx] ptr = self._pointers[idx] np_array = self._token_ids[ptr:ptr + size] return np_array elif isinstance(idx, slice): start, stop, step = idx.indices(len(self)) if step != 1: raise ValueError( "Slices into indexed_dataset must be contiguous") ptr = self._pointers[start] sizes = self._sizes[idx] offsets = list(accumulate(sizes)) total_size = sum(sizes) np_array = self._token_ids[ptr:ptr + total_size] sents = np.split(np_array, offsets[:-1]) return sents def get(self, idx, offset=0, length=None): """ Retrieves a single item from the dataset with the option to only return a portion of the item. get(idx) is the same as [idx] but get() does not support slicing. """ size = self._sizes[idx] ptr = self._pointers[idx] if length is None: length = size - offset ptr += offset np_array = self._token_ids[ptr:prt + length] return np_array @property def sizes(self): return self._sizes @property def doc_idx(self): return self._doc_idx def get_doc_idx(self): return self._doc_idx def set_doc_idx(self, doc_idx_): self._doc_idx = doc_idx_ def make_indexed_dataset(data_prefix, data_impl=None, skip_warmup=False): return MMapIndexedDataset(data_prefix) def get_a_and_b_segments(sample, np_rng): """Divide sample into a and b segments.""" # Number of sentences in the sample. n_sentences = len(sample) # Make sure we always have two sentences. assert n_sentences > 1, 'make sure each sample has at least two sentences.' # First part: # `a_end` is how many sentences go into the `A`. a_end = 1 if n_sentences >= 3: # Note that randin in numpy is exclusive. 
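        # a_end is drawn from [1, n_sentences - 1], so segment B always keeps
        # at least the last sentence.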
a_end = np_rng.randint(1, n_sentences) tokens_a = [] for j in range(a_end): tokens_a.extend(sample[j]) # Second part: tokens_b = [] for j in range(a_end, n_sentences): tokens_b.extend(sample[j]) # Random next: is_next_random = False if np_rng.random() < 0.5: is_next_random = True tokens_a, tokens_b = tokens_b, tokens_a return tokens_a, tokens_b, is_next_random def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): """Truncates a pair of sequences to a maximum sequence length.""" #print(len_a, len_b, max_num_tokens) assert len_a > 0 if len_a + len_b <= max_num_tokens: return False while len_a + len_b > max_num_tokens: if len_a > len_b: len_a -= 1 tokens = tokens_a else: len_b -= 1 tokens = tokens_b if np_rng.random() < 0.5: del tokens[0] else: tokens.pop() return True def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" tokens = [] tokentypes = [] # [CLS]. tokens.append(cls_id) tokentypes.append(0) # Segment A. for token in tokens_a: tokens.append(token) tokentypes.append(0) # [SEP]. tokens.append(sep_id) tokentypes.append(0) # Segment B. for token in tokens_b: tokens.append(token) tokentypes.append(1) if tokens_b: # [SEP]. tokens.append(sep_id) tokentypes.append(1) return tokens, tokentypes MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "label"]) def is_start_piece(piece): """Check if the current word piece is the starting piece (BERT).""" # When a word has been split into # WordPieces, the first token does not have any marker and any subsequence # tokens are prefixed with ##. So whenever we see the ## token, we # append it to the previous set of word indexes. return not piece.startswith("##") def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=3, vocab_token_to_id_dict=None, do_whole_word_mask=True, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, to_chinese_char=False, inplace_random_mask=False, masking_style="bert"): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: token_boundary[i] = 1 continue # Whole Word Masking means that if we mask all of the wordpieces # corresponding to an original word. # # Note that Whole Word Masking does *not* change the training code # at all -- we still predict each WordPiece independently, softmaxed # over the entire vocabulary. 
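        # Group a '##'-prefixed continuation piece with the preceding start piece,
        # so the whole word is masked (or kept) together later.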
vocab_id = vocab_id_to_token_dict[token] if (do_whole_word_mask and len(cand_indexes) >= 1 and not is_start_piece(vocab_id)): cand_indexes[-1].append(i) else: cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 if to_chinese_char: # set ## chinse char to original chinese char char_tokens = [] assert vocab_token_to_id_dict is not None for i, b in enumerate(token_boundary): if b == 0: vocab_id = vocab_id_to_token_dict[tokens[i]] new_vocab_id = vocab_id[2:] if len( re.findall('##[\u4E00-\u9FA5]', vocab_id)) > 0 else vocab_id char_tokens.append(vocab_token_to_id_dict[new_vocab_id] if new_vocab_id in vocab_token_to_id_dict else token) else: char_tokens.append(tokens[i]) output_tokens = list(char_tokens) else: output_tokens = list(tokens) masked_lm_positions = [] masked_lm_labels = [] if masked_lm_prob == 0: return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) # NOTE(shenliang03): to avoid num_to_predict < 1 num_to_predict = max(1, min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))) ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if not geometric_dist: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] ngram_indexes = [] for idx in range(len(cand_indexes)): ngram_index = [] for n in ngrams: ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() backup_output_tokens = list(output_tokens) for cand_index_set in ngram_indexes: if len(masked_lms) >= num_to_predict: break if not cand_index_set: continue # Note(mingdachen): # Skip current piece if they are covered in lm masking or previous ngrams. for index_set in cand_index_set[0]: for index in index_set: if index in covered_indexes: continue if not geometric_dist: n = np_rng.choice( ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) else: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) index_set = sum(cand_index_set[n - 1], []) n -= 1 # Note(mingdachen): # Repeatedly looking for a candidate that does not exceed the # maximum number of predictions by trying shorter ngrams. while len(masked_lms) + len(index_set) > num_to_predict: if n == 0: break index_set = sum(cand_index_set[n - 1], []) n -= 1 # If adding a whole-word mask would exceed the maximum number of # predictions, then just skip this candidate. 
if len(masked_lms) + len(index_set) > num_to_predict: continue is_any_index_covered = False for index in index_set: if index in covered_indexes: is_any_index_covered = True break if is_any_index_covered: continue for index in index_set: covered_indexes.add(index) masked_token = None if masking_style == "bert": # 80% of the time, replace with [MASK] if np_rng.random() < 0.8: masked_token = mask_id else: # 10% of the time, keep original if np_rng.random() < 0.5: masked_token = output_tokens[index] # 10% of the time, replace with random word else: if inplace_random_mask: masked_token = backup_output_tokens[np_rng.randint( 0, len(output_tokens))] else: masked_token = vocab_id_list[np_rng.randint( 0, len(vocab_id_list))] elif masking_style == "t5": masked_token = mask_id else: raise ValueError("invalid value of masking style") output_tokens[index] = masked_token masked_lms.append( MaskedLmInstance( index=index, label=backup_output_tokens[index])) masked_spans.append( MaskedLmInstance( index=index_set, label=[backup_output_tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict np_rng.shuffle(ngram_indexes) select_indexes = set() if do_permutation: for cand_index_set in ngram_indexes: if len(select_indexes) >= num_to_predict: break if not cand_index_set: continue # Note(mingdachen): # Skip current piece if they are covered in lm masking or previous ngrams. for index_set in cand_index_set[0]: for index in index_set: if index in covered_indexes or index in select_indexes: continue n = np.random.choice( ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) index_set = sum(cand_index_set[n - 1], []) n -= 1 while len(select_indexes) + len(index_set) > num_to_predict: if n == 0: break index_set = sum(cand_index_set[n - 1], []) n -= 1 # If adding a whole-word mask would exceed the maximum number of # predictions, then just skip this candidate. if len(select_indexes) + len(index_set) > num_to_predict: continue is_any_index_covered = False for index in index_set: if index in covered_indexes or index in select_indexes: is_any_index_covered = True break if is_any_index_covered: continue for index in index_set: select_indexes.add(index) assert len(select_indexes) <= num_to_predict select_indexes = sorted(select_indexes) permute_indexes = list(select_indexes) np_rng.shuffle(permute_indexes) orig_token = list(output_tokens) for src_i, tgt_i in zip(select_indexes, permute_indexes): output_tokens[src_i] = orig_token[tgt_i] masked_lms.append( MaskedLmInstance( index=src_i, label=orig_token[src_i])) masked_lms = sorted(masked_lms, key=lambda x: x.index) # Sort the spans by the index of the first span masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) for p in masked_lms: masked_lm_positions.append(p.index) masked_lm_labels.append(p.label) return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, masked_labels, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" # Some checks. num_tokens = len(tokens) padding_length = max_seq_length - num_tokens assert padding_length >= 0 assert len(tokentypes) == num_tokens assert len(masked_positions) == len(masked_labels) # Tokens and token types. filler = [pad_id] * padding_length tokens_np = np.array(tokens + filler, dtype=np.int64) tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) # Padding mask. 
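    # 1 marks a real token, 0 marks a padded position.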
padding_mask_np = np.array( [1] * num_tokens + [0] * padding_length, dtype=np.int64) # Lables and loss mask. labels = [-1] * max_seq_length loss_mask = [0] * max_seq_length for i in range(len(masked_positions)): assert masked_positions[i] < num_tokens labels[masked_positions[i]] = masked_labels[i] loss_mask[masked_positions[i]] = 1 labels_np = np.array(labels, dtype=np.int64) loss_mask_np = np.array(loss_mask, dtype=np.int64) return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): print_rank_0(' > building dataset index ...') start_time = time.time() indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] print_rank_0(' > finished creating indexed dataset in {:4f} ' 'seconds'.format(time.time() - start_time)) print_rank_0(' > indexed dataset stats:') print_rank_0(' number of documents: {}'.format( indexed_dataset.doc_idx.shape[0] - 1)) print_rank_0(' number of sentences: {}'.format( indexed_dataset.sizes.shape[0])) return indexed_dataset def get_train_valid_test_split_(splits_string, size): """ Get dataset splits from comma or '/' separated string list.""" splits = [] if splits_string.find(',') != -1: splits = [float(s) for s in splits_string.split(',')] elif splits_string.find('/') != -1: splits = [float(s) for s in splits_string.split('/')] else: splits = [float(splits_string)] while len(splits) < 3: splits.append(0.) splits = splits[:3] splits_sum = sum(splits) assert splits_sum > 0.0 splits = [split / splits_sum for split in splits] splits_index = [0] for index, split in enumerate(splits): splits_index.append(splits_index[index] + int( round(split * float(size)))) diff = splits_index[-1] - size for index in range(1, len(splits_index)): splits_index[index] -= diff assert len(splits_index) == 4 assert splits_index[-1] == size return splits_index def get_samples_mapping(indexed_dataset, data_prefix, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, name, binary_head, share_folder): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" if not num_epochs: if not max_num_samples: raise ValueError("Need to specify either max_num_samples " "or num_epochs") num_epochs = np.iinfo(np.int32).max - 1 if not max_num_samples: max_num_samples = np.iinfo(np.int64).max - 1 # Filename of the index mapping indexmap_filename = data_prefix indexmap_filename += '_{}_indexmap'.format(name) if num_epochs != (np.iinfo(np.int32).max - 1): indexmap_filename += '_{}ep'.format(num_epochs) if max_num_samples != (np.iinfo(np.int64).max - 1): indexmap_filename += '_{}mns'.format(max_num_samples) indexmap_filename += '_{}msl'.format(max_seq_length) indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) indexmap_filename += '_{}s'.format(seed) indexmap_filename += '.npy' local_rank = get_local_rank() if share_folder: local_rank = paddle.distributed.get_rank() # Build the indexed mapping if not exist. if local_rank == 0 and \ not os.path.isfile(indexmap_filename): print(' > WARNING: could not find index map file {}, building ' 'the indices on rank 0 ...'.format(indexmap_filename)) # Make sure the types match the helpers input types. 
assert indexed_dataset.doc_idx.dtype == np.int64 print(indexed_dataset.sizes.dtype) assert indexed_dataset.sizes.dtype == np.int32 try: import ppfleetx.data.data_tools.cpp.fast_index_map_helpers as ernie_fast_index_map_helpers except Exception as e: start_time = time.time() print('> compiling dataset index builder ...') from ppfleetx.data.data_tools.cpp.compile import compile_helper compile_helper() print( '>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) import ppfleetx.data.data_tools.cpp.fast_index_map_helpers as ernie_fast_index_map_helpers samples_mapping = ernie_fast_index_map_helpers.build_mapping( indexed_dataset.doc_idx, indexed_dataset.sizes, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, True, 2 if binary_head else 1) print_rank_0(' > done building sapmles index maping') start_time = time.time() np.save(indexmap_filename, samples_mapping, allow_pickle=True) print_rank_0(' > saved the index mapping in {}'.format( indexmap_filename)) # Make sure all the ranks have built the mapping print_rank_0(' > elasped time to build and save samples mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) else: while True: if (not os.path.isfile(indexmap_filename)): time.sleep(3) else: try: np.load( indexmap_filename, allow_pickle=True, mmap_mode='r') break except Exception as e: print( "%s file is still writing or damaged, please wait a moment." % indexmap_filename) time.sleep(3) # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case if paddle.distributed.get_world_size() > 1: if paddle.in_dynamic_mode(): paddle.distributed.barrier() # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( indexmap_filename)) start_time = time.time() samples_mapping = np.load( indexmap_filename, allow_pickle=True, mmap_mode='r') print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(time.time( ) - start_time)) print_rank_0(' total number of samples: {}'.format( samples_mapping.shape[0])) return samples_mapping ================================================ FILE: ppfleetx/data/dataset/ernie/ernie_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import sys import time import numpy as np import re import copy from functools import partial import paddle from .dataset_utils import ( get_samples_mapping, get_a_and_b_segments, truncate_segments, create_tokens_and_tokentypes, create_masked_lm_predictions, make_indexed_dataset, get_indexed_dataset_, ) from paddlenlp.transformers import ErnieTokenizer from paddlenlp.datasets.dataset import MapDataset, IterableDataset, SimpleBuilder, load_dataset def get_local_rank(): return int(os.getenv("PADDLE_RANK_IN_NODE", 0)) print_rank_0 = print mode_to_index = {"Train": 0, "Eval": 1, "Test": 2} mode_to_key = {"Train": "train", "Eval": "dev", "Test": "test"} class ErnieDataset(paddle.io.Dataset): def __init__(self, input_dir, tokenizer_type, split, num_samples, mode, max_seq_length, masked_lm_prob, short_seq_prob, seed, binary_head, share_folder, favor_longer_ngram, max_ngrams): tokenizer = ErnieTokenizer.from_pretrained(tokenizer_type) tokenizer.extend_chinese_char() files = get_train_data_file(input_dir)[0] skip_warmup = True indexed_dataset = get_indexed_dataset_(files, None, skip_warmup) total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 splits = get_train_valid_test_split_(split, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') def print_split_stats(name, index): print_rank_0(' {}:'.format(name)) print_rank_0(' document indices in [{}, {}) total of {} ' 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index])) start_index = indexed_dataset.doc_idx[splits[index]] end_index = indexed_dataset.doc_idx[splits[index + 1]] print_rank_0(' sentence indices in [{}, {}) total of {} ' 'sentences'.format(start_index, end_index, end_index - start_index)) index = mode_to_index[mode] print_split_stats(mode, index) # dataset = None assert splits[index + 1] > splits[index] # Get the pointer to the original doc-idx so we can set it later. doc_idx_ptr = indexed_dataset.get_doc_idx() # Slice the doc-idx start_index = splits[index] # Add +1 so we can index into the dataset to get the upper bound. end_index = splits[index + 1] + 1 # New doc_idx view. indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) # Build the dataset accordingly. self.seed = seed self.masked_lm_prob = masked_lm_prob self.max_seq_length = max_seq_length self.binary_head = binary_head self.share_folder = share_folder self.indexed_dataset = indexed_dataset self.favor_longer_ngram = favor_longer_ngram self.max_ngrams = max_ngrams # Build the samples mapping. self.samples_mapping = get_samples_mapping( self.indexed_dataset, files, None, num_samples, self.max_seq_length - 3, # account for added tokens short_seq_prob, self.seed, mode, self.binary_head, self.share_folder) self.vocab_id_list = list(tokenizer.vocab.idx_to_token.keys()) self.vocab_id_to_token_dict = copy.deepcopy( tokenizer.vocab.idx_to_token) self.vocab_token_to_id_dict = copy.deepcopy( tokenizer.vocab.token_to_idx) # ERNIE is chinse char level model, sometime is need # add ## chinse char to encode and decode. # Here we extend the vocab dict. 
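        # added_tokens_decoder / added_tokens_encoder hold the tokenizer's added tokens,
        # presumably the '##'-prefixed Chinese character tokens registered by
        # tokenizer.extend_chinese_char() above.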
self.vocab_id_to_token_dict.update(tokenizer.added_tokens_decoder) self.vocab_token_to_id_dict.update(tokenizer.added_tokens_encoder) self.cls_id = tokenizer.cls_token_id self.sep_id = tokenizer.sep_token_id self.mask_id = tokenizer.mask_token_id self.pad_id = tokenizer.pad_token_id def __len__(self): return self.samples_mapping.shape[0] def __getitem__(self, idx): start_idx, end_idx, seq_length = self.samples_mapping[idx] sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) return build_training_sample( sample, seq_length, self.max_seq_length, # needed for padding self.vocab_id_list, self.vocab_id_to_token_dict, self.vocab_token_to_id_dict, self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.binary_head, self.favor_longer_ngram, self.max_ngrams) def build_training_sample(sample, target_seq_length, max_seq_length, vocab_id_list, vocab_id_to_token_dict, vocab_token_to_id_dict, cls_id, sep_id, mask_id, pad_id, masked_lm_prob, np_rng, binary_head, favor_longer_ngram=False, max_ngrams=3): """Biuld training sample. Arguments: sample: A list of sentences in which each sentence is a list token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. vocab_token_to_id_dict: A dictionary from text tokens to vocab ids. cls_id: Start of example id. sep_id: Separator id. mask_id: Mask token id. pad_id: Padding token id. masked_lm_prob: Probability to mask tokens. np_rng: Random number genenrator. Note that this rng state should be numpy and not python since python randint is inclusive for the opper bound whereas the numpy one is exclusive. """ if binary_head: # We assume that we have at least two sentences in the sample assert len(sample) > 1, "The sentence num should be large than 1." assert target_seq_length <= max_seq_length # Divide sample into two segments (A and B). if binary_head: tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, np_rng) else: tokens_a = [] for j in range(len(sample)): tokens_a.extend(sample[j]) tokens_b = [] is_next_random = False # Truncate to `target_sequence_length`. max_num_tokens = target_seq_length truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b), max_num_tokens, np_rng) # Build tokens and toketypes. tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id) # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, vocab_token_to_id_dict=vocab_token_to_id_dict, to_chinese_char=True, inplace_random_mask=False, favor_longer_ngram=favor_longer_ngram, max_ngrams=max_ngrams, ) # Padding. 
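    # Pad every field to max_seq_length; this module's pad_and_convert_to_numpy
    # (defined below) builds an additive float32 attention mask of shape [1, 1, max_seq_length].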
    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
        = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                                   masked_labels, pad_id, max_seq_length)

    return tokens_np, tokentypes_np, padding_mask_np, masked_positions, masked_labels, int(
        is_next_random)


def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                             masked_labels, pad_id, max_seq_length):
    """Pad sequences and convert them to numpy."""

    # Some checks.
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    assert len(tokentypes) == num_tokens
    assert len(masked_positions) == len(masked_labels)

    # Tokens and token types.
    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

    # Padding mask.
    padding_mask_np = np.array(
        [1] * num_tokens + [0] * padding_length, dtype=np.float32)
    padding_mask_np = (1 - padding_mask_np) * -1e4
    padding_mask_np = padding_mask_np.reshape([1, 1, -1])

    # Labels and loss mask.
    labels = [-1] * max_seq_length
    loss_mask = [0] * max_seq_length
    for i in range(len(masked_positions)):
        assert masked_positions[i] < num_tokens
        labels[masked_positions[i]] = masked_labels[i]
        loss_mask[masked_positions[i]] = 1
    labels_np = np.array(labels, dtype=np.int64)
    loss_mask_np = np.array(loss_mask, dtype=np.int64)
    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np


def get_train_data_file(input_dir):
    # `logger` is not imported at the top of this file; import it here so the
    # multi-dataset branch below does not raise a NameError.
    from ppfleetx.utils.log import logger

    if len(input_dir.split()) > 1:
        # weight-1 data-prefix-1 weight-2 data-prefix-2 ...
        return input_dir.split()
    else:
        files = [
            os.path.join(input_dir, f) for f in os.listdir(input_dir)
            if (os.path.isfile(os.path.join(input_dir, f)) and
                "_idx.npz" in str(f))
        ]
        files = [x.replace("_idx.npz", "") for x in files]

        if len(files) > 1:
            ret = []
            logger.info("You are using multi-dataset:")
            for x in files:
                ret.append(1.0)
                ret.append(x)
                logger.info(" > set weight of %s dataset to 1.0" % x)
            return ret

    return files


def get_train_valid_test_split_(splits, size):
    """
    Get dataset splits from comma or '/' separated string list.
    """
    splits = [float(s) for s in splits]
    while len(splits) < 3:
        splits.append(0.)
splits = splits[:3] splits_sum = sum(splits) assert splits_sum > 0.0 splits = [split / splits_sum for split in splits] splits_index = [0] for index, split in enumerate(splits): splits_index.append(splits_index[index] + int( round(split * float(size)))) diff = splits_index[-1] - size for index in range(1, len(splits_index)): splits_index[index] -= diff assert len(splits_index) == 4 assert splits_index[-1] == size return splits_index class ErnieSeqClsDataset(paddle.io.Dataset): def __init__(self, dataset_type, tokenizer_type, max_seq_len, mode): self.dataset = dataset_type self.max_seq_len = max_seq_len self.mode = mode_to_key[mode] from ppfleetx.data.tokenizers import get_ernie_tokenizer self.tokenizer = get_ernie_tokenizer(tokenizer_type) dataset_config = self.dataset.split(" ") raw_datasets = load_dataset( dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], ) self.label_list = getattr(raw_datasets['train'], "label_list", None) # Define dataset pre-process function if "clue" in self.dataset: trans_fn = partial(self._clue_trans_fn) else: trans_fn = partial(self._seq_trans_fn) self.seqcls_dataset = raw_datasets[self.mode].map(trans_fn) def __getitem__(self, idx): return self.seqcls_dataset.__getitem__(idx) def __len__(self): return self.seqcls_dataset.__len__() def _seq_trans_fn(self, example): return self._convert_example( example, tokenizer=self.tokenizer, max_seq_length=self.max_seq_len, ) def _clue_trans_fn(self, example): return self._convert_clue( example, label_list=self.label_list, tokenizer=self.tokenizer, max_seq_length=self.max_seq_len, ) def _convert_example(self, example, tokenizer, max_seq_length=512, is_test=False): is_test = True if 'label' in example.keys(): is_test = False if "text_b" in example.keys(): text = example["text_a"] text_pair = example["text_b"] else: text = example["text"] text_pair = None encoded_inputs = tokenizer( text=text, text_pair=text_pair, max_seq_len=max_seq_length) input_ids = encoded_inputs["input_ids"] token_type_ids = encoded_inputs["token_type_ids"] if is_test: return { "input_ids": input_ids, "token_type_ids": token_type_ids, } else: # label = np.array([example["label"]], dtype="int64") label = int(example["label"]) return { "input_ids": input_ids, "token_type_ids": token_type_ids, "labels": label } # Data pre-process function for clue benchmark datatset def _convert_clue(self, example, label_list, tokenizer=None, max_seq_length=512, **kwargs): """convert a glue example into necessary features""" is_test = False if 'label' not in example.keys(): is_test = True if not is_test: # `label_list == None` is for regression task label_dtype = "int64" if label_list else "float32" # Get the label example['label'] = int(example[ "label"]) if label_dtype != "float32" else float(example[ "label"]) label = example['label'] # Convert raw text to feature if 'keyword' in example: # CSL sentence1 = " ".join(example['keyword']) example = { 'sentence1': sentence1, 'sentence2': example['abst'], 'label': example['label'] } elif 'target' in example: # wsc text, query, pronoun, query_idx, pronoun_idx = example[ 'text'], example['target']['span1_text'], example['target'][ 'span2_text'], example['target']['span1_index'], example[ 'target']['span2_index'] text_list = list(text) assert text[pronoun_idx:(pronoun_idx + len( pronoun))] == pronoun, "pronoun: {}".format(pronoun) assert text[query_idx:(query_idx + len(query) )] == query, "query: {}".format(query) if pronoun_idx > query_idx: text_list.insert(query_idx, "_") text_list.insert(query_idx + 
len(query) + 1, "_") text_list.insert(pronoun_idx + 2, "[") text_list.insert(pronoun_idx + len(pronoun) + 2 + 1, "]") else: text_list.insert(pronoun_idx, "[") text_list.insert(pronoun_idx + len(pronoun) + 1, "]") text_list.insert(query_idx + 2, "_") text_list.insert(query_idx + len(query) + 2 + 1, "_") text = "".join(text_list) example['sentence'] = text if tokenizer is None: return example if 'sentence' in example: example = tokenizer( example['sentence'], max_seq_len=max_seq_length) elif 'sentence1' in example: example = tokenizer( example['sentence1'], text_pair=example['sentence2'], max_seq_len=max_seq_length) if not is_test: if "token_type_ids" in example: return { "input_ids": example['input_ids'], "token_type_ids": example['token_type_ids'], "labels": label } else: return {"input_ids": example['input_ids'], "labels": label} else: return { "input_ids": example['input_ids'], "token_type_ids": example['token_type_ids'] } ================================================ FILE: ppfleetx/data/dataset/glue_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import numpy as np import paddle from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.utils.download import cached_path from ppfleetx.utils.file import unzip, parse_csv __all__ = [ 'CoLA', 'SST2', 'MNLI', 'QNLI', 'RTE', 'WNLI', 'MRPC', 'QQP', 'STSB' ] """ Single-Sentence Tasks: * CoLA * SST-2 Similarity and Paraphrase Tasks: * MRPC * STS-B * QQP Inference Tasks: * MNLI * QNLI * RTE * WNLI """ class CoLA(paddle.io.Dataset): """The Corpus of Linguistic Acceptability consists of English acceptability judgments drawn from books and journal articles on linguistic theory. Each example is a sequence of words annotated with whether it is a grammatical English sentence.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/cola.html#CoLA URL = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip" MD5 = "9f6d88c3558ec424cd9d66ea03589aba" NUM_LINES = { "train": 8551, "dev": 527, "test": 516, } _PATH = "cola_public_1.1.zip" DATASET_NAME = "CoLA" _EXTRACTED_FILES = { "train": os.path.join("raw", "in_domain_train.tsv"), "dev": os.path.join("raw", "in_domain_dev.tsv"), "test": os.path.join("raw", "out_of_domain_dev.tsv"), } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
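        # (Added note) The extracted CoLA files are tab-separated. Based on how
        # _filter_res and _modify_res below consume them, each kept row has four
        # columns and appears to be laid out as:
        #   column 0: source id, column 1: 0/1 acceptability label,
        #   column 2: original author annotation, column 3: the sentence text,
        # so _modify_res returns the (sentence, label) pair (x[3], int(x[1])).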
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _filter_res(x): return len(x) == 4 def _modify_res(x): return (x[3], int(x[1])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res, filter_funcs=_filter_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[1] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class SST2(paddle.io.Dataset): """The Stanford Sentiment Treebank consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence. We use the two-way (positive/negative) class split, and use only sentence-level labels.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/sst2.html#SST2 URL = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip" MD5 = "9f81648d4199384278b86e315dac217c" NUM_LINES = { "train": 67349, "dev": 872, "test": 1821, } _PATH = "SST-2.zip" DATASET_NAME = "SST2" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] # test split for SST2 doesn't have labels if split == "test": def _modify_test_res(t): return (t[1].strip(), ) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_test_res) else: def _modify_res(t): return (t[0].strip(), int(t[1])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[1] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class MNLI(paddle.io.Dataset): """The Multi-Genre Natural Language Inference Corpus is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are gathered from ten different sources, including transcribed speech, fiction, and government reports. We use the standard test set, for which we obtained private labels from the authors, and evaluate on both the matched (in-domain) and mismatched (cross-domain) section. 
We also use and recommend the SNLI corpus as 550k examples of auxiliary training data.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/mnli.html#MNLI URL = "https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip" MD5 = "0f70aaf66293b3c088a864891db51353" NUM_LINES = { "train": 392702, "dev_matched": 9815, "dev_mismatched": 9832, } _PATH = "multinli_1.0.zip" DATASET_NAME = "MNLI" _EXTRACTED_FILES = { "train": "multinli_1.0_train.txt", "dev_matched": "multinli_1.0_dev_matched.txt", "dev_mismatched": "multinli_1.0_dev_mismatched.txt", } LABEL_TO_INT = {"entailment": 0, "neutral": 1, "contradiction": 2} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev_matched', 'dev_mismatched'] def _filter_res(x): return x[0] in self.LABEL_TO_INT def _modify_res(x): return (x[5], x[6], self.LABEL_TO_INT[x[0]]) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res, filter_funcs=_filter_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) return input_ids, sample[2] def __len__(self): return len(self.samples) @property def class_num(self): return 3 class QNLI(paddle.io.Dataset): """The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question (written by an annotator). We convert the task into sentence pair classification by forming a pair between each question and each sentence in the corresponding context, and filtering out pairs with low lexical overlap between the question and the context sentence. The task is to determine whether the context sentence contains the answer to the question. 
This modified version of the original task removes the requirement that the model select the exact answer, but also removes the simplifying assumptions that the answer is always present in the input and that lexical overlap is a reliable cue.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/qnli.html#QNLI URL = "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip" MD5 = "b4efd6554440de1712e9b54e14760e82" NUM_LINES = { "train": 104743, "dev": 5463, "test": 5463, } _PATH = "QNLIv2.zip" DATASET_NAME = "QNLI" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } MAP_LABELS = {"entailment": 0, "not_entailment": 1} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for QNLI doesn't have labels return (x[1], x[2]) else: return (x[1], x[2], self.MAP_LABELS[x[3]]) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class RTE(paddle.io.Dataset): """The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/rte.html#RTE URL = "https://dl.fbaipublicfiles.com/glue/data/RTE.zip" MD5 = "bef554d0cafd4ab6743488101c638539" NUM_LINES = { "train": 67349, "dev": 872, "test": 1821, } _PATH = "RTE.zip" DATASET_NAME = "RTE" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } MAP_LABELS = {"entailment": 0, "not_entailment": 1} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for RTE doesn't have labels return (x[1], x[2]) else: return (x[1], x[2], self.MAP_LABELS[x[3]]) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class WNLI(paddle.io.Dataset): """The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices. The examples are manually constructed to foil simple statistical methods: Each one is contingent on contextual information provided by a single word or phrase in the sentence. To convert the problem into sentence pair classification, we construct sentence pairs by replacing the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of new examples derived from fiction books that was shared privately by the authors of the original corpus. While the included training set is balanced between two classes, the test set is imbalanced between them (65% not entailment). Also, due to a data quirk, the development set is adversarial: hypotheses are sometimes shared between training and development examples, so if a model memorizes the training examples, they will predict the wrong label on corresponding development set example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence between a model's score on this task and its score on the unconverted original task. We call converted dataset WNLI (Winograd NLI).""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/wnli.html#WNLI URL = "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip" MD5 = "a1b4bd2861017d302d29e42139657a42" NUM_LINES = { "train": 635, "dev": 71, "test": 146, } _PATH = "WNLI.zip" DATASET_NAME = "WNLI" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for WNLI doesn't have labels return (x[1], x[2]) else: return (x[1], x[2], int(x[3])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class MRPC(paddle.io.Dataset): """The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.""" # ref https://pytorch.org/text/stable/_modules/torchtext/datasets/mrpc.html#MRPC URL = { "train": "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt", "test": "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt", } MD5 = { "train": "793daf7b6224281e75fe61c1f80afe35", "test": "e437fdddb92535b820fe8852e2df8a49", } NUM_LINES = { "train": 4076, "test": 1725, } DATASET_NAME = "MRPC" _EXTRACTED_FILES = { "train": "msr_paraphrase_train.txt", "test": "msr_paraphrase_test.txt", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) cached_path(self.URL[split], cache_dir=os.path.abspath(self.root)) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'test'] def _modify_res(x): return (x[3], x[4], int(x[0])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) return input_ids, sample[2] def __len__(self): return len(self.samples) @property def class_num(self): return 2 class QQP(paddle.io.Dataset): """The Quora Question Pairs2 dataset is a collection of question pairs from the community question-answering website Quora. 
The task is to determine whether a pair of questions are semantically equivalent.""" # ref https://huggingface.co/datasets/glue/blob/main/glue.py#L212-L239 URL = "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip" MD5 = "884bf26e39c783d757acc510a2a516ef" NUM_LINES = { "train": 363846, "dev": 40430, "test": 390961, } _PATH = "QQP-clean.zip" DATASET_NAME = "QQP" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } MAP_LABELS = {"not_duplicate": 0, "duplicate": 1} def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for QQP doesn't have labels return (x[1], x[2]) else: return (x[3], x[4], int(x[5])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': return input_ids, sample[2] else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 class STSB(paddle.io.Dataset): """The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of sentence pairs drawn from news headlines, video and image captions, and natural language inference data. Each pair is human-annotated with a similarity score from 1 to 5.""" # ref https://huggingface.co/datasets/glue/blob/main/glue.py#L240-L267 URL = "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip" MD5 = "d573676be38f1a075a5702b90ceab3de" NUM_LINES = { "train": 5749, "dev": 1500, "test": 1379, } _PATH = "STS-B.zip" DATASET_NAME = "STSB" _EXTRACTED_FILES = { "train": "train.tsv", "dev": "dev.tsv", "test": "test.tsv", } def __init__(self, root, split, max_length=128): self.root = root self.split = split if os.path.exists(self.root): assert os.path.isdir(self.root) else: zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) unzip( zip_path, mode="r", out_dir=os.path.join(self.root, '..'), delete=True) self.path = os.path.join(self.root, self._EXTRACTED_FILES[split]) assert os.path.exists(self.path), f"{self.path} is not exists!" 
self.max_length = max_length self.tokenizer = GPTTokenizer.from_pretrained("gpt2") assert split in ['train', 'dev', 'test'] def _modify_res(x): if split == 'test': # test split for STSB doesn't have labels return (x[7], x[8]) else: return (x[7], x[8], float(x[9])) self.samples = parse_csv( self.path, skip_lines=1, delimiter="\t", map_funcs=_modify_res) def __getitem__(self, idx): sample = self.samples[idx] encoded_inputs = self.tokenizer( sample[0], text_pair=sample[1], padding="max_length", truncation="longest_first", max_length=self.max_length, return_token_type_ids=False) input_ids = encoded_inputs['input_ids'] input_ids = paddle.to_tensor(input_ids) if self.split != 'test': # Note(GuoxiaWang): We need return shape [1] value, # so that we can attain a batched label with shape [batchsize, 1]. # Because the logits shape is [batchsize, 1], and feed into MSE loss. return input_ids, np.array([sample[2]], dtype=np.float32) else: return input_ids def __len__(self): return len(self.samples) @property def class_num(self): return 2 ================================================ FILE: ppfleetx/data/dataset/gpt_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import time import numpy as np import re import math import json import paddle from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from ppfleetx.data.tokenizers import GPTTokenizer # TODO(haohongxiang): to solve the problem of cross-reference import paddlenlp from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer mode_to_index = {"Train": 0, "Eval": 1, "Test": 2} MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "MoE": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } class GPTDataset(paddle.io.Dataset): def __init__(self, input_dir, split, max_seq_len, num_samples, mode, model_type="GPT", seed=1234): files = get_train_data_file(input_dir) files.sort() input_dir = [files[0]] local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) if local_rank == 0: try: import ppfleetx.data.data_tools.cpp.fast_index_map_helpers except Exception as e: start_time = time.time() print('> compiling dataset index builder ...') from ppfleetx.data.data_tools.cpp.compile import compile_helper compile_helper() print( '>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) device_world_size = paddle.distributed.get_world_size() if device_world_size > 1 and local_rank != 0: while True: try: import ppfleetx.data.data_tools.cpp.fast_index_map_helpers break except Exception as e: print("> wait for helpers to be compiled!") time.sleep(1) try: data_world_size = env.get_data_world_size() logger.info( "The distributed run, total device num:{}, distinct dataflow num:{}.". format(device_world_size, data_world_size)) except AttributeError: pass assert len(input_dir) == 1, "GPT only support one dataset for now." 
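        # (Added note) `input_prefix` below is a path prefix rather than a full
        # file name. The current dataset format expects two sibling files:
        #   <prefix>_ids.npy - token ids of all documents, flattened to 1-D
        #   <prefix>_idx.npz - metadata with "lens", the per-document lengths,
        # so that sum(lens) == len(ids). A single legacy <prefix>_ids.npz file
        # holding both "ids" and "lens" is still accepted with a warning.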
input_prefix = input_dir[0] if os.path.isfile(input_prefix + "_ids.npz"): logger.warning( "You are using compatible dataset, please make new dataset as the readme!" ) process_data = np.load( input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True) sample_ids = process_data["ids"] sample_lens = process_data["lens"].astype("int32") else: for suffix in ["_ids.npy", "_idx.npz"]: if not os.path.isfile(input_prefix + suffix): raise ValueError("File Not found, %s" % (input_prefix + suffix)) sample_ids = np.load( input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True) # All documment ids, extend as 1-D array. process_data = np.load(input_prefix + "_idx.npz") # The len(sample_lens) num of docs # The sum(sample_lens) should equal len(sample_ids) sample_lens = process_data["lens"] splits = get_train_valid_test_split_(split, len(sample_lens)) assert len(sample_lens) >= splits[ -1], "The document nums should larger than max of splits, but %s < %s" % ( len(sample_lens), splits[-1]) tokenizer_class, pretrained_name = MODEL_CLASSES[model_type] tokenizer = tokenizer_class.from_pretrained(pretrained_name) self.input_dir = input_dir self.max_seq_len = max_seq_len self.mode = mode self.name = "gpt_" + mode self.eos_id = tokenizer.eos_token_id self.sample_ids = sample_ids self.sample_lens = sample_lens self.build_data_file = (local_rank == 0) if mode in mode_to_index.keys(): index = mode_to_index[mode] else: raise ValueError("valid str value for 'mode'") documents = np.arange(splits[index], splits[index + 1]) if documents is None: document_ids = np.arange(0, self.sample_lens.shape[0]) else: document_ids = documents self.doc_idx, self.sample_idx, self.shuffle_idx = \ construct_samples_and_shuffle_data(self.name, input_prefix, document_ids,\ self.sample_lens, num_samples, max_seq_len, seed, self.build_data_file) # The doc cumsum start pos self.start_pos = [0] + np.cumsum(self.sample_lens).tolist() def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # Attention mask for the attention calulate # attention_mask = np.tri(seq_length, seq_length).reshape((1, seq_length, # seq_length)) # The pad and eos tokens do not contribute the loss loss_mask = np.ones(seq_length, dtype="float32") loss_mask[tokens == self.eos_id] = 0.0 position_ids = np.arange(0, seq_length, dtype="int64") labels = np.array(labels).astype("int64") tokens = np.array(tokens).astype("int64") if self.mode == "Test": return [tokens, position_ids] else: return [tokens, position_ids, labels, loss_mask] def _get_single_sample_from_idx(self, doc_index_f, doc_index_l, offset_f, offset_l): """ The input means: doc_index_f: data from the first doc. doc_index_l: data from the last doc. offset_f: offset of the first doc. offset_l: offset of the last doc. """ # Data from the sample doc. just select the needed ids. if doc_index_f == doc_index_l: current_start_pos = self.start_pos[self.doc_idx[doc_index_f]] return self.sample_ids[current_start_pos+offset_f:\ current_start_pos+offset_l+1].tolist() # Data from multi docs. 
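        # (Added note) When a sample crosses document boundaries, the branch
        # below concatenates the tail of the first document, every full document
        # in between, and the head of the last document, so each sample still
        # yields max_seq_len + 1 tokens before being split into inputs/labels.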
else: current_start_pos = self.start_pos[self.doc_idx[doc_index_f]] next_start_pos = self.start_pos[self.doc_idx[doc_index_f] + 1] tokens = self.sample_ids[current_start_pos + offset_f: next_start_pos].tolist() for i in range(doc_index_f + 1, doc_index_l): current_start_pos = self.start_pos[self.doc_idx[i]] next_start_pos = self.start_pos[self.doc_idx[i] + 1] tokens.extend(self.sample_ids[current_start_pos:next_start_pos] .tolist()) last_start_pos = self.start_pos[self.doc_idx[doc_index_l]] tokens.extend(self.sample_ids[last_start_pos:last_start_pos + offset_l + 1].tolist()) return tokens def __getitem__(self, index): idx = self.shuffle_idx[index] # Start and end documents and offsets. doc_index_f = self.sample_idx[idx][0] doc_index_l = self.sample_idx[idx + 1][0] offset_f = self.sample_idx[idx][1] offset_l = self.sample_idx[idx + 1][1] tokens = self._get_single_sample_from_idx(doc_index_f, doc_index_l, offset_f, offset_l) return self._construct_sample(tokens) def __len__(self): return self.sample_idx.shape[0] - 1 def get_train_data_file(input_dir): files = [ os.path.join(input_dir, f) for f in os.listdir(input_dir) if (os.path.isfile(os.path.join(input_dir, f)) and str(f) .endswith("_idx.npz")) ] files = [x.replace("_idx.npz", "") for x in files] if len(files) == 0: logger.warning( "Not found dataset with name of xxx_ids.npy and xxx_idx.npz! Try to found old compatible xxx_ids.npz file." ) else: return files files = [ os.path.join(input_dir, f) for f in os.listdir(input_dir) if (os.path.isfile(os.path.join(input_dir, f)) and str(f) .endswith("_ids.npz")) ] files = [x.replace("_ids.npz", "") for x in files] if len(files) == 0: raise RuntimeError( "Not found dataset with name of xxx_ids.npz in given input_dir '{}'! ". format(input_dir)) else: return files def get_train_valid_test_split_(splits, size): """ Get dataset splits from comma or '/' separated string list. """ splits = [float(s) for s in splits] while len(splits) < 3: splits.append(0.) splits = splits[:3] splits_sum = sum(splits) assert splits_sum > 0.0 splits = [split / splits_sum for split in splits] splits_index = [0] for index, split in enumerate(splits): splits_index.append(splits_index[index] + int( round(split * float(size)))) diff = splits_index[-1] - size for index in range(1, len(splits_index)): splits_index[index] -= diff assert len(splits_index) == 4 assert splits_index[-1] == size return splits_index def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes, num_samples, seq_length, seed, build_data_file): """ documents: document index from 0 to len(docs) sizes: the length list of all docs. num_samples: total step*bs iterations of data. seq_length: the sequence length. sum(sizes) = tokens_per_epoch data_nums = num_samples * micro_batch_size num_epochs = (data_nums + 1) // sum(sizes) len(doc_idx) = num_epochs * sum(sizes) """ # Number of tokens in each epoch and number of required epochs. tokens_per_epoch = _num_tokens(documents, sizes) num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) # Rng state np_rng = np.random.RandomState(seed=seed) # Filename of the index mappings. _filename = data_prefix _filename += '_{}_indexmap'.format(name) _filename += '_{}ns'.format(num_samples) _filename += '_{}sl'.format(seq_length) doc_idx_filename = _filename + '_doc_idx.npy' sample_idx_filename = _filename + '_sample_idx.npy' shuffle_idx_filename = _filename + '_shuffle_idx.npy' # Sava random state savedState = np_rng.get_state() # Build the indexed mapping if not exist. 
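    # (Added note) Three memory-mapped index files are produced per dataset:
    #   *_doc_idx.npy     - the document order: documents shuffled and repeated
    #                       num_epochs times
    #   *_sample_idx.npy  - (doc_idx position, token offset) pairs; rows i and
    #                       i + 1 delimit exactly seq_length + 1 tokens, giving
    #                       sample_idx.shape[0] - 1 samples in total
    #   *_shuffle_idx.npy - a permutation mapping the dataloader index onto
    #                       sample_idx entries
    # Only the rank with build_data_file=True writes them; other ranks poll for
    # the files in the else-branch below.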
if build_data_file: if (not os.path.isfile(doc_idx_filename)) or \ (not os.path.isfile(sample_idx_filename)) or \ (not os.path.isfile(shuffle_idx_filename)): if num_epochs == 1: separate_last_epoch = False else: num_samples_from_epochs_minus_one = ( (num_epochs - 1) * tokens_per_epoch - 1) // seq_length last_epoch_num_samples = num_samples - \ num_samples_from_epochs_minus_one assert last_epoch_num_samples >= 0, \ 'last epoch number of samples should be non-negative.' num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length assert last_epoch_num_samples < (num_samples_per_epoch + 1), \ 'last epoch number of samples exceeded max value.' separate_last_epoch = ( last_epoch_num_samples < int(0.80 * num_samples_per_epoch)) # Note. len(doc_idx) = num_epochs * len(doc) start_time = time.time() doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch) np.save(doc_idx_filename, doc_idx, allow_pickle=True) print(' > elasped time to build and save doc-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) # sample-idx. pos of each seq_len of data. start_time = time.time() assert doc_idx.dtype == np.int32 assert sizes.dtype == np.int32 from ppfleetx.data.data_tools.cpp import fast_index_map_helpers sample_idx = fast_index_map_helpers.build_sample_idx( sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch) # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, # num_epochs, tokens_per_epoch) np.save(sample_idx_filename, sample_idx, allow_pickle=True) print(' > elasped time to build and save sample-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) # shuffle-idx. start_time = time.time() if separate_last_epoch: num_samples_ = num_samples_from_epochs_minus_one else: num_samples_ = sample_idx.shape[0] - 1 # Shuffle all seq len data. shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng) np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) print(' > elasped time to build and save shuffle-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) else: while True: if (not os.path.isfile(doc_idx_filename)) or \ (not os.path.isfile(sample_idx_filename)) or \ (not os.path.isfile(shuffle_idx_filename)): time.sleep(3) else: try: np.load( shuffle_idx_filename, allow_pickle=True, mmap_mode='r') break except Exception as e: print( "%s file is still writing or damaged, please wait a moment." % shuffle_idx_filename) time.sleep(3) # Restore random state np_rng.set_state(savedState) try: if paddle.distributed.get_world_size() > 1: if paddle.in_dynamic_mode(): paddle.distributed.barrier() except AssertionError: pass # Load mappings. doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') shuffle_idx = np.load( shuffle_idx_filename, allow_pickle=True, mmap_mode='r') return doc_idx, sample_idx, shuffle_idx def _num_tokens(documents, lens): """Total number of tokens in the dataset.""" return np.sum(lens[documents]) def _num_epochs(tokens_per_epoch, seq_length, num_samples): """Based on number of samples and sequence lenght, calculate how many epochs will be needed.""" num_epochs = 0 total_tokens = 0 while True: num_epochs += 1 total_tokens += tokens_per_epoch if ((total_tokens - 1) // seq_length) >= num_samples: return num_epochs def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): """ Build an array with length = number-of-epochs * number-of-documents. Each index is mapped to a corresponding document. 
""" if not separate_last_epoch or num_epochs == 1: doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] doc_idx[:] = documents # The documents repeat num_epochs times. doc_idx = doc_idx.reshape(-1) doc_idx = doc_idx.astype(np.int32) np_rng.shuffle(doc_idx) return doc_idx doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False) doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) return np.concatenate((doc_idx_first, doc_idx_last)) def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): """ num_samples + 1, pos of bs data the distance between two points for sample idx is bs tokens. """ num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length sample_idx = np.zeros([int(num_samples) + 1, 2], dtype=np.int32) sample_index = 0 doc_idx_index = 0 doc_offset = 0 sample_idx[sample_index][0] = doc_idx_index sample_idx[sample_index][1] = doc_offset sample_index += 1 while sample_index <= num_samples: remaining_seq_length = seq_length + 1 while remaining_seq_length != 0: doc_id = doc_idx[doc_idx_index] doc_length = sizes[doc_id] - doc_offset remaining_seq_length -= doc_length if remaining_seq_length <= 0: doc_offset += (remaining_seq_length + doc_length - 1) remaining_seq_length = 0 else: doc_idx_index += 1 doc_offset = 0 sample_idx[sample_index][0] = doc_idx_index sample_idx[sample_index][1] = doc_offset sample_index += 1 return sample_idx def _build_shuffle_idx(num_samples, total_size, np_rng): dtype_ = np.uint32 if total_size >= (np.iinfo(np.uint32).max - 1): dtype_ = np.int64 shuffle_idx_first = np.arange( start=0, stop=num_samples, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx_first) if num_samples == total_size: return shuffle_idx_first shuffle_idx_last = np.arange( start=num_samples, stop=total_size, step=1, dtype=dtype_) np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) class LM_Eval_Dataset(paddle.io.Dataset): def __init__(self, input_dir, max_seq_len, overlapping_eval=None, model_type="GPT", **kwargs): tokenizer_class, pretrained_name = MODEL_CLASSES[model_type] tokenizer = tokenizer_class.from_pretrained(pretrained_name) with open(input_dir, "rb") as reader: entire_data = reader.read().decode('utf-8') self.num_original_tokens = len(entire_data.strip().split(" ")) entire_data = self._wikitext_detokenizer(entire_data) self.tokens = tokenizer.encode(entire_data) self.num_tokenized_tokens = len(self.tokens) print('Original Tokens: %d, Detokenized tokens: %d' % (self.num_original_tokens, self.num_tokenized_tokens)) self.seq_len = max_seq_len self.pad_idx = tokenizer.eos_token_id self.overlapping_eval = overlapping_eval if self.overlapping_eval is None: self.overlapping_eval = self.seq_len self.overlapping_eval = max(1, self.overlapping_eval) self.total_targets = len(self.tokens) - 1 # remove first sequence tokens targets = max(self.total_targets - self.overlapping_eval, 0) self.total_sequences = max( math.ceil(targets / self.overlapping_eval) + 1, 1) def __len__(self): return self.total_sequences def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss loss_mask = np.ones(seq_length, dtype="float32") loss_mask[tokens == self.pad_idx] = 0.0 position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default 
# attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, loss_mask, attention_mask, position_ids, labels] def __getitem__(self, idx): start_idx = idx * self.overlapping_eval end_idx = start_idx + self.seq_len tokens = self.tokens[start_idx:end_idx + 1] num_tokens = len(tokens) if num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad [tokens, loss_mask, attention_mask, position_ids, labels] = self._construct_sample(tokens) if self.overlapping_eval != self.seq_len and idx != 0: loss_mask[:-self.overlapping_eval] *= 0 return [tokens, loss_mask, attention_mask, position_ids, labels, \ np.array([self.num_original_tokens, self.num_tokenized_tokens])] def _wikitext_detokenizer(self, string): # contractions string = string.replace("s '", "s'") string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) # number separators string = string.replace(" @-@ ", "-") string = string.replace(" @,@ ", ",") string = string.replace(" @.@ ", ".") # punctuation string = string.replace(" : ", ": ") string = string.replace(" ; ", "; ") string = string.replace(" . ", ". ") string = string.replace(" ! ", "! ") string = string.replace(" ? ", "? ") string = string.replace(" , ", ", ") # double brackets string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) # miscellaneous string = string.replace("= = = =", "====") string = string.replace("= = =", "===") string = string.replace("= =", "==") string = string.replace(" " + chr(176) + " ", chr(176)) string = string.replace(" \n", "\n") string = string.replace("\n ", "\n") string = string.replace(" N ", " 1 ") string = string.replace(" 's", "'s") return string class Lambada_Eval_Dataset(paddle.io.Dataset): def __init__(self, input_dir, max_seq_len, model_type="GPT", **kwargs): tokenizer_class, pretrained_name = MODEL_CLASSES[model_type] tokenizer = tokenizer_class.from_pretrained(pretrained_name) tokenized_data = [] tokenized_label = [] with open(input_dir, 'r') as f: for line in f.readlines(): text = json.loads(line)['text'] tokens, labels = self._get_tokens(tokenizer, text) tokenized_data.append(tokens) tokenized_label.append(labels) self.pad_idx = tokenizer.eos_token_id self.seq_len = max_seq_len self.tokens = tokenized_data self.labels = tokenized_label def __len__(self): return len(self.tokens) def _construct_sample(self, tokens): tokens = np.array(tokens).astype("int64").tolist() labels = tokens[1:] tokens = tokens[:-1] seq_length = len(tokens) # attention mask for the attention calulate attention_mask = np.tri(seq_length, seq_length).reshape( (1, seq_length, seq_length)) # the pad and eos tokens do not contribute the loss position_ids = np.arange(0, seq_length, dtype="int64") # -INF mask value as default #attention_mask = (attention_mask - 1.0) * 1e9 # Bool mask of attention attention_mask = attention_mask.astype("float32") return [tokens, attention_mask, position_ids, labels] def __getitem__(self, idx): tokens = self.tokens[idx][:self.seq_len] labels = self.labels[idx] tokens = tokens + labels num_tokens = len(tokens) if num_tokens < self.seq_len + 1: num_pad = (self.seq_len + 1 - num_tokens) tokens += [self.pad_idx] * num_pad loss_mask = np.zeros(self.seq_len, dtype="float32") loss_mask[num_tokens - len(labels) - 
1:num_tokens - 1] = 1. [tokens, attention_mask, position_ids, labels] = self._construct_sample(tokens) return [ tokens, loss_mask, attention_mask, position_ids, labels, np.array([self.__len__()]) ] def _get_tokens(self, tokenizer, text, strict=True): if not strict: tokens = tokenizer.encode(text) return tokens[:-1], [tokens[-1]] last_token = text.split()[-1] start_idx = text.rfind(last_token) beginning_tokens = tokenizer.encode(text[:start_idx].strip()) last_token = tokenizer.encode(' ' + last_token) return beginning_tokens, last_token ================================================ FILE: ppfleetx/data/dataset/multimodal_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time import gzip import random import base64 import numpy as np import blobfile as bf from random import randint, choice from tqdm import tqdm from io import BytesIO from pathlib import Path from copy import deepcopy import PIL from PIL import Image, ImageFile import paddle from paddle.io import Dataset, DataLoader from paddle.distributed import get_world_size from paddle.vision import transforms as T from ppfleetx.utils.log import logger def get_keys(data_path, gpu_num): files = [ file.strip() for file in open(data_path).readlines() if file.strip() != "" ] local_rank = paddle.distributed.get_rank() if len(files) % gpu_num == 0: keys_extend = list(files) else: added_num = gpu_num - (len(files) % gpu_num) try: keys_extend = files + random.sample(files, added_num) except: keys_extend = files + random.sample(files, 1) * added_num keys = keys_extend[local_rank::gpu_num] logger.info("keys: {} {}".format(keys, local_rank)) return keys class ImagenDataset(Dataset): def __init__(self, input_path, image_format='base64', shuffle=False, image_size=64, text_max_len=128, filter_image_resolution=128, tokenizer=None, sr=False, split='train', interpolation="bicubic", flip_p=0.5): super().__init__() device_world_size = paddle.distributed.get_world_size() self.filename = get_keys(input_path, gpu_num=device_world_size) if shuffle: random.shuffle(self.filename) self.filter_image_resolution = filter_image_resolution self.text_max_len = text_max_len self.split = split self.tokenizer = tokenizer self.sr = sr if sr: self.transform = T.Compose([T.Resize(image_size), T.ToTensor()]) self.for_line = self.get_line_for_line(self.filename).__iter__() self.good_index = [] self.interpolation = { "linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = T.RandomHorizontalFlip(prob=flip_p) self.image_size = image_size def load_path(self, data_path, f_index=None): if f_index is None: offset = 0 with open(data_path, 'rb') as f: for line in tqdm(f, desc='Loading data'): self.indexes.append((offset, len(line))) offset += len(line) else: offset = 0 with open(data_path, 'rb') as f: for line in tqdm(f, desc='Loading data'): self.indexes.append(((offset, 
len(line)), f_index)) offset += len(line) if self.split == 'train': random.shuffle(self.indexes) return @staticmethod def base64_to_image(base64_str): byte_data = base64.b64decode(base64_str) image_data = BytesIO(byte_data) img = Image.open(image_data) if img.mode != 'RGB': img = img.convert('RGB') return img def get_line_for_line(self, filename): while True: for fname in filename: if fname[-2:] != "gz": file = open(fname) for line in file: if line != "": data = line.strip().split('\t') image_base64 = data[4] image_item = self.base64_to_image(image_base64) if min(image_item.size) >= self.image_size: yield line else: file = gzip.GzipFile(fname, "r") for line in file: if line != "": line = line.decode() data = line.strip().split('\t') image_base64 = data[4] image_item = self.base64_to_image(image_base64) if min(image_item.size) >= self.image_size: yield line def __getitem__(self, index): if not isinstance(self.filename, list): data = self.for_line.__next__() else: data = self.for_line.__next__() data = data.strip().split('\t') # For laion 400m if len(data) == 6: image_base64 = data[4] caption = data[2] image_item = self.base64_to_image(image_base64) # Filter image resolution if min(image_item.size) < self.filter_image_resolution: return None if not self.sr: self.transform = T.Compose([ T.CenterCrop([min(image_item.size), min(image_item.size)]), T.Resize(64), T.ToTensor() ]) image_item = self.transform(image_item) else: img = np.array(image_item).astype(np.uint8) crop = min(img.shape[0], img.shape[1]) h, w, = img.shape[0], img.shape[1] if img.shape[0] > img.shape[1]: img = img[0:crop, (w - crop) // 2:(w + crop) // 2] else: img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:( w + crop) // 2] image = Image.fromarray(img) image = image.resize( (self.image_size, self.image_size), resample=self.interpolation) image_item = self.transform(image) example = {'id': index, 'image': image_item, 'caption': caption} return example def __len__(self): #return len(self.indexes) if self.sr: return 300000000 return 5000000 ================================================ FILE: ppfleetx/data/dataset/vision_dataset.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
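# (Added note, not part of the original header) GeneralClsDataset below reads
# `cls_label_path` as a plain-text annotation file with one sample per line,
# "<image path relative to image_root><delimiter><label>", e.g. with the
# default " " delimiter (paths are illustrative only):
#   train/cat/0001.jpg 0
#   train/dog/0002.jpg 1
# When multi_label=True, the label field is a comma-separated list of class
# ids, e.g. "train/0003.jpg 2,5", which is expanded into a one-hot vector.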
import os import os.path import copy import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple import paddle from ppfleetx.utils.log import logger from ppfleetx.data.transforms.utils import create_preprocess_operators, transform __all__ = [ "GeneralClsDataset", "ImageFolder", "CIFAR10", "ContrativeLearningDataset", ] class GeneralClsDataset(paddle.io.Dataset): def __init__(self, image_root, cls_label_path, transform_ops=None, delimiter=" ", multi_label=False, class_num=None): if multi_label: assert class_num is not None, "Must set class_num when multi_label=True" self.multi_label = multi_label self.classes_num = class_num self._img_root = image_root self._cls_path = cls_label_path self.delimiter = delimiter self._transform_ops = None if transform_ops: self._transform_ops = create_preprocess_operators(transform_ops) self.images = [] self.labels = [] self._load_anno() def _load_anno(self): assert os.path.exists( self._cls_path), f"{self._cls_path} does not exists" assert os.path.exists( self._img_root), f"{self._img_root} does not exists" self.images = [] self.labels = [] with open(self._cls_path) as fd: lines = fd.readlines() for l in lines: l = l.strip().split(self.delimiter) self.images.append(os.path.join(self._img_root, l[0])) if self.multi_label: self.labels.append(l[1]) else: self.labels.append(np.int32(l[1])) assert os.path.exists(self.images[ -1]), f"{self.images[-1]} does not exists" def __getitem__(self, idx): try: with open(self.images[idx], 'rb') as f: img = f.read() if self._transform_ops: img = transform(img, self._transform_ops) if self.multi_label: one_hot = np.zeros([self.classes_num], dtype=np.float32) cls_idx = [int(e) for e in self.labels[idx].split(',')] for idx in cls_idx: one_hot[idx] = 1.0 return (img, one_hot) else: return (img, np.int32(self.labels[idx])) except Exception as ex: logger.error("Exception occured when parse line: {} with msg: {}". format(self.images[idx], ex)) rnd_idx = np.random.randint(self.__len__()) return self.__getitem__(rnd_idx) def __len__(self): return len(self.images) @property def class_num(self): if self.multi_label: return self.classes_num return len(set(self.labels)) IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp") class ImageFolder(paddle.io.Dataset): """ Code ref from https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py A generic data loader where the images are arranged in this way by default: :: root/dog/xxx.png root/dog/xxy.png root/dog/[...]/xxz.png root/cat/123.png root/cat/nsdf3.png root/cat/[...]/asd932_.png This class inherits from :class:`~torchvision.datasets.DatasetFolder` so the same methods can be overridden to customize the dataset. Args: root (string): Root directory path. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop`` target_transform (callable, optional): A function/transform that takes in the target and transforms it. loader (callable, optional): A function to load an image given its path. is_valid_file (callable, optional): A function that takes path of an Image file and check if the file is a valid file (used to check of corrupt files) Attributes: classes (list): List of the class names sorted alphabetically. class_to_idx (dict): Dict with items (class_name, class_index). 
imgs (list): List of (image path, class_index) tuples """ def __init__(self, root, extensions=IMG_EXTENSIONS, transform_ops=None): self.root = root classes, class_to_idx = self.find_classes(self.root) samples = self.make_dataset(self.root, class_to_idx, extensions) logger.info( f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions self.classes = classes self.class_to_idx = class_to_idx self.imgs = samples self.targets = [s[1] for s in samples] self._transform_ops = None if transform_ops: self._transform_ops = create_preprocess_operators(transform_ops) @staticmethod def make_dataset( directory, class_to_idx, extensions=None, is_valid_file=None, ): """Generates a list of samples of a form (path_to_sample, class). Args: directory (str): root dataset directory, corresponding to ``self.root``. class_to_idx (Dict[str, int]): Dictionary mapping class name to class index. extensions (optional): A list of allowed extensions. Either extensions or is_valid_file should be passed. Defaults to None. is_valid_file (optional): A function that takes path of a file and checks if the file is a valid file (used to check of corrupt files) both extensions and is_valid_file should not be passed. Defaults to None. Raises: ValueError: In case ``class_to_idx`` is empty. ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. FileNotFoundError: In case no valid file was found for any class. Returns: List[Tuple[str, int]]: samples of a form (path_to_sample, class) """ if class_to_idx is None: # prevent potential bug since make_dataset() would use the class_to_idx logic of the # find_classes() function, instead of using that of the find_classes() method, which # is potentially overridden and thus could have a different logic. raise ValueError("The class_to_idx parameter cannot be None.") directory = os.path.expanduser(directory) both_none = extensions is None and is_valid_file is None both_something = extensions is not None and is_valid_file is not None if both_none or both_something: raise ValueError( "Both extensions and is_valid_file cannot be None or not None at the same time" ) if extensions is not None: def is_valid_file(filename: str) -> bool: return filename.lower().endswith( extensions if isinstance(extensions, str) else tuple(extensions)) is_valid_file = cast(Callable[[str], bool], is_valid_file) instances = [] available_classes = set() for target_class in sorted(class_to_idx.keys()): class_index = class_to_idx[target_class] target_dir = os.path.join(directory, target_class) if not os.path.isdir(target_dir): continue for root, _, fnames in sorted( os.walk( target_dir, followlinks=True)): for fname in sorted(fnames): path = os.path.join(root, fname) if is_valid_file(path): item = path, class_index instances.append(item) if target_class not in available_classes: available_classes.add(target_class) empty_classes = set(class_to_idx.keys()) - available_classes if empty_classes: msg = f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " if extensions is not None: msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}" raise FileNotFoundError(msg) return instances def find_classes(self, directory): """Find the class folders in a dataset structured as follows:: directory/ ├── class_x │ ├── xxx.ext │ ├── xxy.ext │ └── ... │ └── xxz.ext └── class_y ├── 123.ext ├── nsdf3.ext └── ... 
└── asd932_.ext This method can be overridden to only consider a subset of classes, or to adapt to a different dataset directory structure. Args: directory(str): Root directory path, corresponding to ``self.root`` Raises: FileNotFoundError: If ``dir`` has no class folders. Returns: (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index. """ classes = sorted( entry.name for entry in os.scandir(directory) if entry.is_dir()) if not classes: raise FileNotFoundError( f"Couldn't find any class folder in {directory}.") class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} return classes, class_to_idx def __getitem__(self, idx): try: path, target = self.imgs[idx] with open(path, 'rb') as f: img = f.read() if self._transform_ops: img = transform(img, self._transform_ops) return (img, np.int32(target)) except Exception as ex: logger.error("Exception occured when parse line: {} with msg: {}". format(path, ex)) rnd_idx = np.random.randint(self.__len__()) return self.__getitem__(rnd_idx) def __len__(self) -> int: return len(self.imgs) @property def class_num(self): return len(set(self.classes)) class CIFAR10(paddle.io.Dataset): def __init__( self, root, mode='train', transform_ops=None, ): self.root = root self.mode = mode assert self.mode in ['train', 'test'] self._transform_ops = None self.URL = 'https://dataset.bj.bcebos.com/cifar/cifar-10-python.tar.gz' if transform_ops: self._transform_ops = create_preprocess_operators(transform_ops) if not os.path.exists(os.path.join(self.root, f'data_batch_1')): from ppfleetx.utils.download import cached_path from ppfleetx.utils.file import untar zip_path = cached_path( self.URL, cache_dir=os.path.abspath(self.root)) untar( zip_path, mode="r:gz", out_dir=os.path.join(self.root, '..'), delete=True) # wait to download dataset if paddle.distributed.get_world_size() > 1: paddle.distributed.barrier() self.images = [] self.labels = [] self._load_anno() def _load_anno(self): def unpickle(file): import pickle with open(file, 'rb') as fo: dict = pickle.load(fo, encoding='bytes') return dict if self.mode == 'train': for idx in range(1, 6): path = os.path.join(self.root, f'data_batch_{idx}') ret = unpickle(path) data = ret[b'data'] labels = ret[b'labels'] for i in range(len(data)): img = data[i].reshape((3, 32, 32)).transpose((1, 2, 0)) self.images.append(img) self.labels.append(labels[i]) else: path = os.path.join(self.root, f'test_batch') ret = unpickle(path) data = ret[b'data'] labels = ret[b'labels'] for i in range(len(data)): img = data[i].reshape((3, 32, 32)).transpose((1, 2, 0)) self.images.append(img) self.labels.append(labels[i]) def __getitem__(self, idx): img = self.images[idx] if self._transform_ops: img = transform(img, self._transform_ops) return (img, np.int32(self.labels[idx])) def __len__(self): return len(self.images) @property def class_num(self): return len(set(self.labels)) class ContrativeLearningDataset(ImageFolder): """ Code ref from https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py A generic data loader where the images are arranged in this way by default: :: root/dog/xxx.png root/dog/xxy.png root/dog/[...]/xxz.png root/cat/123.png root/cat/nsdf3.png root/cat/[...]/asd932_.png """ def __init__(self, root, extensions=IMG_EXTENSIONS, transform_ops=None): super(ContrativeLearningDataset, self).__init__( root, extensions=extensions, transform_ops=transform_ops) # remove unused attr del self.classes del self.class_to_idx del self.targets # only use image path self.imgs 
= [s[0] for s in self.imgs] def __getitem__(self, idx): try: path = self.imgs[idx] with open(path, 'rb') as f: img = f.read() if self._transform_ops: img1 = transform(img, self._transform_ops) img2 = transform(img, self._transform_ops) return img1, img2 except Exception as ex: logger.error("Exception occured when parse line: {} with msg: {}". format(path, ex)) rnd_idx = np.random.randint(self.__len__()) return self.__getitem__(rnd_idx) def __len__(self) -> int: return len(self.imgs) @property def class_num(self): raise NotImplementedError ================================================ FILE: ppfleetx/data/sampler/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .batch_sampler import * from .collate import Stack, Pad, Tuple, Dict ================================================ FILE: ppfleetx/data/sampler/batch_sampler.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function from __future__ import division import os import sys import numpy as np import math import paddle from paddle.io import DistributedBatchSampler from ppfleetx.distributed.apis import env __all__ = ["GPTBatchSampler", "DistributedBatchSampler"] class GPTBatchSampler(paddle.io.BatchSampler): """Sampler that restricts data loading to a subset of the dataset. In such case, each process can pass a DistributedBatchSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Args: dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement or other python object which implemented `__len__` for BatchSampler to get sample number of data source. batch_size(int): sample indice number in a mini-batch indices. num_replicas(int, optional): porcess number in distributed training. If :attr:`num_replicas` is None, :attr:`num_replicas` will be retrieved from :code:`paddle.distributed.ParallenEnv`. Default None. rank(int, optional): the rank of the current process among :attr:`num_replicas` processes. If :attr:`rank` is None, :attr:`rank` is retrieved from :code:`paddle.distributed.ParallenEnv`. Default None. shuffle(bool): whther to shuffle indices order before genrating batch indices. Default False. 
drop_last(bool): whether drop the last incomplete batch dataset size is not divisible by the batch size. Default False Examples: .. code-block:: python import numpy as np from paddle.io import Dataset, DistributedBatchSampler # init with dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label def __len__(self): return self.num_samples dataset = RandomDataset(100) sampler = DistributedBatchSampler(dataset, batch_size=64) for data in sampler: # do something break """ def __init__(self, dataset, batch_size, num_replicas=None, rank=None, shuffle=False, drop_last=False, consumed_samples=0): self.dataset = dataset assert isinstance(batch_size, int) and batch_size > 0, \ "batch_size should be a positive integer" self.batch_size = batch_size assert isinstance(shuffle, bool), \ "shuffle should be a boolean value" self.shuffle = shuffle assert isinstance(drop_last, bool), \ "drop_last should be a boolean number" from paddle.distributed import ParallelEnv if num_replicas is not None: assert isinstance(num_replicas, int) and num_replicas > 0, \ "num_replicas should be a positive integer" self.nranks = num_replicas else: self.nranks = env.get_data_world_size() if rank is not None: assert isinstance(rank, int) and rank >= 0, \ "rank should be a non-negative integer" self.local_rank = rank else: self.local_rank = env.get_data_world_rank() self.drop_last = drop_last self.epoch = 0 self.consumed_samples = consumed_samples self.num_samples = int( math.ceil(len(self.dataset) * 1.0 / self.nranks)) self.total_size = self.num_samples * self.nranks def get_start_end_idx(self): start_idx = self.local_rank * self.batch_size end_idx = start_idx + self.batch_size return start_idx, end_idx def __iter__(self): assert self.consumed_samples % self.nranks == 0, \ "The consumed_samples should be divided by nranks. consumed_samples=%d, nranks=%s" % ( self.consumed_samples, self.nranks) self.remain_num_samples = int( math.ceil((len(self.dataset) - self.consumed_samples) * 1.0 / self.nranks)) self.remain_total_size = self.remain_num_samples * self.nranks self.batch_size_times_rank_size = self.batch_size * self.nranks num_samples = len(self.dataset) batch_indices = [] for idx in range(self.consumed_samples, self.total_size): if idx >= num_samples: batch_indices.append(idx - num_samples) else: batch_indices.append(idx) if len(batch_indices) == self.batch_size_times_rank_size: start_idx, end_idx = self.get_start_end_idx() yield batch_indices[start_idx:end_idx] batch_indices = [] if not self.drop_last and len(batch_indices) > 0: yield batch_indices def __len__(self): num_samples = self.num_samples num_samples += int(not self.drop_last) * (self.batch_size - 1) return num_samples // self.batch_size def set_epoch(self, epoch=0, consumed_samples=0): """ Sets the epoch number. When :attr:`shuffle=True`, this number is used as seeds of random numbers. By default, users may not set this, all replicas (workers) use a different random ordering for each epoch. If set same number at each epoch, this sampler will yield the same ordering at all epoches. Arguments: epoch (int): Epoch number. Examples: .. 
code-block:: python from paddle.io import Dataset, DistributedBatchSampler # init with dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label def __len__(self): return self.num_samples dataset = RandomDataset(100) sampler = DistributedBatchSampler(dataset, batch_size=64) for epoch in range(10): sampler.set_epoch(epoch) """ self.epoch = epoch # if we reset the epoch, the consumed_samples should be set to 0. self.consumed_samples = consumed_samples ================================================ FILE: ppfleetx/data/sampler/collate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle __all__ = [ 'Stack', 'Pad', 'Tuple', 'Dict', ] class Stack(object): """ Stacks the input data samples to construct the batch. The N input samples must have the same shape/length and will be stacked to construct a batch. Args: axis (int, optional): The axis in the result data along which the input data are stacked. Default: 0. dtype (str|numpy.dtype, optional): The value type of the output. If it is set to None, the type of input data is used. Default: None. """ def __init__(self, axis=0, dtype=None): self._axis = axis self._dtype = dtype def __call__(self, data): """ Batchifies the input data by stacking. Args: data (list[numpy.ndarray]): The input data samples. It is a list. Each element is a numpy.ndarray or list. Returns: numpy.ndarray: Stacked batch data. Example: .. code-block:: python from paddlenlp.data import Stack a = [1, 2, 3, 4] b = [3, 4, 5, 6] c = [5, 6, 7, 8] result = Stack()([a, b, c]) ''' [[1, 2, 3, 4], [3, 4, 5, 6], [5, 6, 7, 8]] ''' """ data = np.stack( data, axis=self._axis).astype(self._dtype) if self._dtype else np.stack( data, axis=self._axis) return data class Pad(object): """ Pads the input data samples to the largest length at `axis`. Args: pad_val (float|int, optional): The padding value. Default: 0. axis (int, optional): The axis to pad the arrays. The arrays will be padded to the largest length at `axis`. For example, assume the input arrays have shape (10, 8, 5), (6, 8, 5), (3, 8, 5) and the axis is 0. Each input will be padded into (10, 8, 5) and then stacked to form the final output, which has shape (3, 10, 8, 5). Default: 0. ret_length (bool|numpy.dtype, optional): If it is bool, indicate whether to return the valid length in the output, and the data type of returned length is int32 if True. If it is numpy.dtype, indicate the data type of returned length. Default: None. dtype (numpy.dtype, optional): The value type of the output. If it is set to None, the input data type is used. Default: None. pad_right (bool, optional): Whether the padding direction is right-side. 
If True, it indicates we pad to the right side, while False indicates we pad to the left side. Default: True. """ def __init__(self, pad_val=0, axis=0, ret_length=None, dtype=None, pad_right=True): self._pad_val = pad_val self._axis = axis self._ret_length = ret_length self._dtype = dtype self._pad_right = pad_right def __call__(self, data): """ Batchifies the input data by padding. The input will be padded to the largest dimension at `axis` and then stacked to form the final output. In addition, the function will output the original dimensions at the `axis` if `ret_length` is not None or False. Args: data (list[numpy.ndarray|list]): The input data samples. It is a list. Each element is a numpy.ndarray or list. Returns: numpy.ndarray|tuple[numpy.ndarray]: If `ret_length` is False, it is a numpy.ndarray representing the padded batch data and the shape is (N, …). Otherwise, it is a tuple, besides the padded batch data, the tuple also includes a numpy.ndarray representing original length at `axis` of all input samples, which shaped `(N,)`. Example: .. code-block:: python from paddlenlp.data import Pad a = [1, 2, 3, 4] b = [5, 6, 7] c = [8, 9] result = Pad(pad_val=0)([a, b, c]) ''' [[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]] ''' """ # return data itself for rare unexpected cases when 1-D array is passed to Pad if not isinstance(data[0], list) and not isinstance(data[0], np.ndarray): return np.asarray( data, dtype=self._dtype if self._dtype is not None else np.int64) arrs = [np.asarray(ele) for ele in data] original_length = [ele.shape[self._axis] for ele in arrs] max_size = max(original_length) ret_shape = list(arrs[0].shape) ret_shape[self._axis] = max_size ret_shape = (len(arrs), ) + tuple(ret_shape) ret = np.full( shape=ret_shape, fill_value=self._pad_val, dtype=arrs[0].dtype if self._dtype is None else self._dtype) for i, arr in enumerate(arrs): if arr.shape[self._axis] == max_size: ret[i] = arr else: slices = [slice(None) for _ in range(arr.ndim)] if self._pad_right: slices[self._axis] = slice(0, arr.shape[self._axis]) else: slices[self._axis] = slice( max_size - arr.shape[self._axis], max_size) if slices[self._axis].start != slices[self._axis].stop: slices = [slice(i, i + 1)] + slices ret[tuple(slices)] = arr if self._ret_length: return ret, np.asarray( original_length, dtype="int32") if self._ret_length == True else np.asarray( original_length, self._ret_length) else: return ret class Tuple(object): """ Wraps multiple batchify functions together. The input functions will be applied to the corresponding input fields. Each sample should be a list or tuple containing multiple fields. The i'th batchify function stored in Tuple will be applied on the i'th field. For example, when data sample is (nd_data, label), you can wrap two batchify functions using `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and label correspondingly. Args: fn (callable|list[callable]|tuple[callable]): The batchify functions to wrap. It is a callable function or a list/tuple of callable functions. args (tuple[callable]): The additional batchify functions to wrap. """ def __init__(self, fn, *args): if isinstance(fn, (list, tuple)): assert len(args) == 0, 'Input pattern not understood. The input of Tuple can be ' \ 'Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). ' \ 'Received fn=%s, args=%s' % (str(fn), str(args)) self._fn = fn else: self._fn = (fn, ) + args for i, ele_fn in enumerate(self._fn): assert callable( ele_fn ), 'Batchify functions must be callable! 
type(fn[%d]) = %s' % ( i, str(type(ele_fn))) def __call__(self, data): """ Batchifies data samples by applying each function on the corresponding data field, and each data field is produced by stacking the field data of samples. Args: data (list|tuple): The samples to batchfy. Each sample in list/tuple should contain `N` fields. Returns: tuple: A tuple composed of results from all including batchifying functions. Example: .. code-block:: python from paddlenlp.data import Stack, Pad, Tuple data = [ [[1, 2, 3, 4], [1]], [[5, 6, 7], [0]], [[8, 9], [1]], ] batchify_fn = Tuple(Pad(pad_val=0), Stack()) ids, label = batchify_fn(data) ''' ids: [[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]] label: [[1], [0], [1]] ''' """ assert len(data[0]) == len(self._fn),\ 'The number of attributes in each data sample should contain' \ ' {} elements'.format(len(self._fn)) ret = [] for i, ele_fn in enumerate(self._fn): result = ele_fn([ele[i] for ele in data]) if isinstance(result, (tuple, list)): ret.extend(result) else: ret.append(result) return tuple(ret) class Dict(object): """ Wraps multiple batchify functions together. The input functions will be applied to the corresponding input fields. Each sample should be a dict containing multiple fields. Each batchify function with key stored in `Dict` will be applied on the field which has the same key. For example, when data sample is {'tokens': tokens, 'labels': labels}, you can wrap two batchify functions using `Dict({'tokens': DataBatchify, 'labels': LabelBatchify})` to batchify tokens and labels correspondingly. Args: fn (dict): The batchify functions to wrap. It is a dict, which values is callable functions. """ def __init__(self, fn): assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \ 'Received fn=%s' % (str(fn)) self._fn = fn for col_name, ele_fn in self._fn.items(): assert callable( ele_fn ), 'Batchify functions must be callable! type(fn[%d]) = %s' % ( col_name, str(type(ele_fn))) def __call__(self, data): """ Batchifies data samples by applying each function on the corresponding data field, and each data field is produced by stacking the field data with the same key as batchify functions of all samples. Args: data (list[dict]|tuple[dict]): The samples to batchfy. Each sample in list/tuple is a dict with `N` key-values. Returns: tuple: A tuple composed of results from all including batchifying functions. Example: .. code-block:: python from paddlenlp.data import Stack, Pad, Dict data = [ {'labels':[1], 'token_ids':[1, 2, 3, 4]}, {'labels':[0], 'token_ids':[5, 6, 7]}, {'labels':[1], 'token_ids':[8, 9]}, ] batchify_fn = Dict({'token_ids':Pad(pad_val=0), 'labels':Stack()}) ids, label = batchify_fn(data) ''' ids: [[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]] label: [[1], [0], [1]] ''' """ ret = [] for col_name, ele_fn in self._fn.items(): result = ele_fn([ele[col_name] for ele in data]) if isinstance(result, (tuple, list)): ret.extend(result) else: ret.append(result) return tuple(ret) ================================================ FILE: ppfleetx/data/tokenizers/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .gpt_tokenizer import GPTTokenizer, GPTChineseTokenizer from .ernie_tokenizer import get_ernie_tokenizer from .t5_tokenizer import get_t5_tokenizer from .debertav2_tokenizer import get_debertav2_tokenizer ================================================ FILE: ppfleetx/data/tokenizers/debertav2_tokenizer.py ================================================ # coding=utf-8 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization for DebertaV2.""" from __future__ import (absolute_import, division, print_function, unicode_literals) import os import json import copy import logging import warnings import regex as re import unicodedata import sentencepiece as sp from collections import OrderedDict, UserDict from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union from ppfleetx.utils.download import cached_path from ppfleetx.data.tokenizers.tokenization_utils_base import ( _LazyConfigMapping, AddedToken, TruncationStrategy, PaddingStrategy, BatchEncoding, SpecialTokensMixin) logger = logging.getLogger(__name__) MAX_LENGTH = 256 DEFAULT_DebertaV2_NAME = "projects/imagen/cache/deberta-v-xxlarge" # Slow tokenizers used to be saved in three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file FULL_TOKENIZER_FILE = "tokenizer.json" _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json") CONFIG_NAME = "config.json" def get_debertav2_tokenizer(name): tokenizer = DebertaV2Tokenizer.from_pretrained(name) return tokenizer def debertav2_tokenize(texts, tokenizer): encoded = tokenizer.batch_encode_plus( texts, return_tensors="paddle", padding='longest', max_length=MAX_LENGTH, truncation=True) input_ids = encoded.input_ids attn_mask = encoded.attention_mask return input_ids, attn_mask PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { "microsoft/deberta-v2-xlarge": "https://huggingface.co/microsoft/deberta-v2-xlarge/resolve/main/spm.model", "microsoft/deberta-v2-xxlarge": "https://huggingface.co/microsoft/deberta-v2-xxlarge/resolve/main/spm.model", "microsoft/deberta-v2-xlarge-mnli": ("https://huggingface.co/microsoft/deberta-v2-xlarge-mnli/resolve/main/spm.model" ), "microsoft/deberta-v2-xxlarge-mnli": ("https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli/resolve/main/spm.model" ), } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 
"microsoft/deberta-v2-xlarge": 512, "microsoft/deberta-v2-xxlarge": 512, "microsoft/deberta-v2-xlarge-mnli": 512, "microsoft/deberta-v2-xxlarge-mnli": 512, } PRETRAINED_INIT_CONFIGURATION = { "microsoft/deberta-v2-xlarge": { "do_lower_case": False }, "microsoft/deberta-v2-xxlarge": { "do_lower_case": False }, "microsoft/deberta-v2-xlarge-mnli": { "do_lower_case": False }, "microsoft/deberta-v2-xxlarge-mnli": { "do_lower_case": False }, } VOCAB_FILES_NAMES = {"vocab_file": "spm.model"} class DebertaV2Tokenizer(SpecialTokensMixin): r""" Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). Args: vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (`bool`, *optional*, defaults to `False`): Whether or not to lowercase the input when tokenizing. bos_token (`string`, *optional*, defaults to `"[CLS]"`): The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. When building a sequence using special tokens, this is not the token that is used for the beginning of sequence. The token used is the `cls_token`. eos_token (`string`, *optional*, defaults to `"[SEP]"`): The end of sequence token. When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the `sep_token`. unk_token (`str`, *optional*, defaults to `"[UNK]"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. sep_token (`str`, *optional*, defaults to `"[SEP]"`): The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens. pad_token (`str`, *optional*, defaults to `"[PAD]"`): The token used for padding, for example when batching sequences of different lengths. cls_token (`str`, *optional*, defaults to `"[CLS]"`): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (`str`, *optional*, defaults to `"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set: - `enable_sampling`: Enable subword regularization. - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - `nbest_size = {0,1}`: No sampling is performed. - `nbest_size > 1`: samples from the nbest_size results. - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) using forward-filtering-and-backward-sampling algorithm. - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "token_type_ids", "attention_mask"] padding_side = "right" truncation_side = "right" slow_tokenizer_class = None def __init__(self, vocab_file, do_lower_case=False, split_by_punct=False, bos_token="[CLS]", eos_token="[SEP]", unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs=None, **kwargs): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.added_tokens_encoder: Dict[str, int] = {} self.added_tokens_decoder: Dict[int, str] = {} super().__init__( do_lower_case=do_lower_case, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" ) self.do_lower_case = do_lower_case self.split_by_punct = split_by_punct self.vocab_file = vocab_file self._tokenizer = SPMTokenizer( vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs) def __len__(self): """ Size of the full vocabulary with the added tokens. """ return self.vocab_size + len(self.added_tokens_encoder) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) from_pipeline = kwargs.pop("_from_pipeline", None) from_auto_class = kwargs.pop("_from_auto", False) commit_hash = kwargs.pop("_commit_hash", None) _raise_exceptions_for_missing_entries = False user_agent = { "file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__ } if from_pipeline is not None: user_agent["using_pipeline"] = from_pipeline pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} is_local = os.path.isdir(pretrained_model_name_or_path) single_file_id = None if os.path.isfile( pretrained_model_name_or_path ): # or is_remote_url(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " "supported for this tokenizer. Use a model identifier or the path to a directory instead." ) warnings.warn( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " "won't be possible anymore in v5. 
Use a model identifier or the path to a directory instead.", FutureWarning, ) file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path single_file_id = file_id else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } vocab_files = { ** cls.vocab_files_names, ** additional_files_names } if "tokenizer_file" in vocab_files: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE resolved_config_file = cached_file( pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, proxies=proxies, use_auth_token=use_auth_token, revision=revision, local_files_only=local_files_only, subfolder=subfolder, user_agent=user_agent, _raise_exceptions_for_missing_entries=False, _raise_exceptions_for_connection_errors=False, _commit_hash=commit_hash, ) commit_hash = extract_commit_hash(resolved_config_file, commit_hash) if resolved_config_file is not None: with open( resolved_config_file, encoding="utf-8") as reader: tokenizer_config = json.load(reader) if "fast_tokenizer_files" in tokenizer_config: fast_tokenizer_file = get_fast_tokenizer_file( tokenizer_config["fast_tokenizer_files"]) vocab_files["tokenizer_file"] = fast_tokenizer_file # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} unresolved_files = [] for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None elif single_file_id == file_id: if os.path.isfile(file_path): resolved_vocab_files[file_id] = file_path elif is_remote_url(file_path): resolved_vocab_files[file_id] = download_url( file_path, proxies=proxies) else: if subfolder is None: subfolder = "" path_or_repo_id = str(pretrained_model_name_or_path) if os.path.isdir(path_or_repo_id): resolved_file = os.path.join( os.path.join(path_or_repo_id, subfolder), file_path) if not os.path.isfile(resolved_file): if _raise_exceptions_for_missing_entries: raise EnvironmentError( f"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout " f"'https://huggingface.co/{path_or_repo_id}/{revision}' for available files." ) else: resolved_file = None resolved_vocab_files[file_id] = resolved_file else: resolved_vocab_files[file_id] = cached_path( file_path, cache_dir=cache_dir, ) if len(unresolved_files) > 0: logger.info( f"Can't load following files from cache: {unresolved_files} and cannot check if these " "files are necessary for the tokenizer to operate.") if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): raise EnvironmentError( f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing all relevant files for a {cls.__name__} tokenizer." 
) for file_id, file_path in vocab_files.items(): if file_id not in resolved_vocab_files: continue #if is_local: # logger.info(f"loading file {file_path}") #else: # logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") return cls._from_pretrained( resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=use_auth_token, cache_dir=cache_dir, local_files_only=local_files_only, _commit_hash=commit_hash, **kwargs, ) @classmethod def _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=None, cache_dir=None, local_files_only=False, _commit_hash=None, **kwargs): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json # file or if `from_slow` is set to True. from_slow = kwargs.get("from_slow", False) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None if from_slow: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( copy.deepcopy(resolved_vocab_files), pretrained_model_name_or_path, copy.deepcopy(init_configuration), *init_inputs, use_auth_token=use_auth_token, cache_dir=cache_dir, local_files_only=local_files_only, _commit_hash=_commit_hash, **(copy.deepcopy(kwargs)), ) else: slow_tokenizer = None # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop( "tokenizer_config_file", None) if tokenizer_config_file is not None: with open( tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. config_tokenizer_class = init_kwargs.get("tokenizer_class") init_kwargs.pop("tokenizer_class", None) init_kwargs.pop("auto_map", None) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: config_tokenizer_class = None init_kwargs = init_configuration if config_tokenizer_class is None: try: config_dict = resolved_vocab_files.pop("config_file", CONFIG_NAME) config_dict = os.path.join(pretrained_model_name_or_path, config_dict) config_dict = cls._dict_from_json_file(config_dict) config_tokenizer_class = config_dict[ "tokenizer_class"] if "tokenizer_class" in config_dict else None except (OSError, ValueError, KeyError): # skip if an error occurred. config_dict = None if config_tokenizer_class is None: # Third attempt. If we have not yet found the original type of the tokenizer, # we are loading we see if we can infer it from the type of the configuration file from ppfleetx.data.tokenizers.tokenization_utils_base import TOKENIZER_MAPPING_NAMES # tests_ignore model_type = config_dict[ "model_type"] if "model_type" in config_dict else None if model_type is None: # Fallback: use pattern matching on the string. 
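                    # The fallback below scans TOKENIZER_MAPPING_NAMES and takes the first key that
                    # appears as a substring of the checkpoint path/identifier; the tokenizer class
                    # registered for that model type is then used for the class-mismatch check.
                    # If nothing matches, model_type stays None and the mismatch warning is skipped.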
for pattern in TOKENIZER_MAPPING_NAMES.keys(): if pattern in str(pretrained_model_name_or_path): model_type = pattern break if model_type is not None: config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( model_type, (None, None)) if config_tokenizer_class is None: config_tokenizer_class = config_tokenizer_class_fast if config_tokenizer_class is not None: if cls.__name__.replace( "Fast", "") != config_tokenizer_class.replace("Fast", ""): logger.warning( "The tokenizer class you load from this checkpoint is not the same type as the class this" " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you" f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called" f" from is '{cls.__name__}'.") # Update with newly provided kwargs init_kwargs.update(kwargs) # Convert AddedTokens serialized as dict to class instances def convert_added_tokens(obj: Union[AddedToken, Any]): if isinstance(obj, dict) and "__type" in obj and obj[ "__type"] == "AddedToken": obj.pop("__type") return AddedToken(**obj) elif isinstance(obj, (list, tuple)): return list(convert_added_tokens(o) for o in obj) elif isinstance(obj, dict): return {k: convert_added_tokens(v) for k, v in obj.items()} return obj init_kwargs = convert_added_tokens(init_kwargs) # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings model_max_length = cls.max_model_input_sizes[ pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): model_max_length = min( init_kwargs.get("model_max_length", int(1e30)), model_max_length) # TODO(PVP) - uncomment following line in Transformers v5 # init_kwargs["model_max_length"] = model_max_length # TODO(PVP) - remove in Transformers v5 # --- init_kwargs[ "model_max_length"] = cls._eventually_correct_t5_max_length( pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")) # --- # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer init_kwargs["name_or_path"] = pretrained_model_name_or_path # Instantiate tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: raise OSError( "Unable to load vocabulary from file. " "Please check that the provided vocabulary is accessible and not corrupted." 
) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` # Removed: Now done at the base class level # tokenizer.init_inputs = init_inputs # tokenizer.init_kwargs = init_kwargs # If there is a complementary special token map, load it special_tokens_map_file = resolved_vocab_files.pop( "special_tokens_map_file", None) if special_tokens_map_file is not None: with open( special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: special_tokens_map = json.load(special_tokens_map_handle) for key, value in special_tokens_map.items(): if key in kwargs and kwargs[key]: # This value has already been redefined by the kwargs # We keep this new value and ignore the one stored in the special_tokens_map_file continue if isinstance(value, dict): value = AddedToken(**value) elif isinstance(value, list): value = [ AddedToken(**token) if isinstance(token, dict) else token for token in value ] setattr(tokenizer, key, value) # Add supplementary tokens. special_tokens = tokenizer.all_special_tokens if added_tokens_file is not None: with open( added_tokens_file, encoding="utf-8") as added_tokens_handle: added_tok_encoder = json.load(added_tokens_handle) # Sort added tokens by index added_tok_encoder_sorted = list( sorted( added_tok_encoder.items(), key=lambda x: x[1])) # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for # individual tokens would repeatedly rebuild a trie, which can be slow. is_last_special = None tokens = [] for token, index in added_tok_encoder_sorted: current_index = len(tokenizer) + len(tokens) if has_tokenizer_file and index != current_index and tokenizer.convert_tokens_to_ids( token) != index: # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the # index is the current length of the tokenizer (not in vocabulary) raise ValueError( f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " f"{index}.") elif not has_tokenizer_file and index != current_index: # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the # current length of the tokenizer. raise ValueError( f"Non-consecutive added token '{token}' found. " f"Should have index {current_index} but has index {index} in saved vocabulary." ) is_special = bool(token in special_tokens) if is_last_special is None or is_last_special == is_special: tokens.append(token) else: tokenizer.add_tokens( tokens, special_tokens=is_last_special) tokens = [token] is_last_special = is_special if tokens: tokenizer.add_tokens(tokens, special_tokens=is_last_special) # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab added_tokens = tokenizer.sanitize_special_tokens() #if added_tokens: # logger.warning_advice( # "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" # " fine-tuned or trained." 
# ) return tokenizer @property def vocab_size(self): return len(self.vocab) @property def vocab(self): return self._tokenizer.vocab def get_vocab(self): vocab = self.vocab.copy() vocab.update(self.get_added_vocab()) return vocab @classmethod def _dict_from_json_file(cls, json_file): with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() return json.loads(text) def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" if self.do_lower_case: text = text.lower() return self._tokenizer.tokenize(text) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self._tokenizer.spm.PieceToId(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self._tokenizer.spm.IdToPiece( index) if index < self.vocab_size else self.unk_token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" return self._tokenizer.decode(tokens) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A DeBERTa sequence has the following format: - single sequence: [CLS] X [SEP] - pair of sequences: [CLS] A [SEP] B [SEP] Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ( [0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa sequence pair mask has the following format: ``` 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | ``` If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
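        Example (a minimal sketch; the token IDs are placeholders):

        ```python
        tokenizer.create_token_type_ids_from_sequences([10, 11], [20, 21])
        # [CLS] 10 11 [SEP] 20 21 [SEP]  ->  [0, 0, 0, 0, 1, 1, 1]
        ```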
""" sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): add_prefix_space = kwargs.pop("add_prefix_space", False) if is_split_into_words or add_prefix_space: text = " " + text return (text, kwargs) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]: return self._tokenizer.save_pretrained( save_directory, filename_prefix=filename_prefix) def _eventual_warn_about_too_long_sequence(self, ids, max_length, verbose: bool): """ Depending on the input and internal state we might trigger a warning about a sequence that is too long for its corresponding model Args: ids (`List[str]`): The ids produced by the tokenization max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set) verbose (`bool`): Whether or not to print more information and warnings. """ if max_length is None and len(ids) > self.model_max_length and verbose: if not self.deprecation_warnings.get( "sequence-length-is-longer-than-the-specified-maximum", False): logger.warning( "Token indices sequence length is longer than the specified maximum sequence length " f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " "will result in indexing errors") self.deprecation_warnings[ "sequence-length-is-longer-than-the-specified-maximum"] = True def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 
512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." ) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. 
" "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." ) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def _pad(self, encoded_inputs, max_length=None, padding_strategy=PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of=None, return_attention_mask=None): """ Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Args: encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). max_length: maximum length of the returned list and optionally padding length (see below). Will truncate by taking into account the special tokens. padding_strategy: PaddingStrategy to use for padding. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad The tokenizer padding sides are defined in self.padding_side: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names required_input = encoded_inputs[self.model_input_names[0]] if padding_strategy == PaddingStrategy.LONGEST: max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( required_input) != max_length # Initialize attention mask if not present. 
if return_attention_mask and "attention_mask" not in encoded_inputs: encoded_inputs["attention_mask"] = [1] * len(required_input) if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs[ "attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs[ "special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [ 0 ] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = [ self.pad_token_type_id ] * difference + encoded_inputs["token_type_ids"] if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = [ 1 ] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError("Invalid padding strategy:" + str( self.padding_side)) return encoded_inputs def pad( self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, ): """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`) If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of PyTorch tensors, you will lose the specific device of your tensors however. Args: encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask) return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return Numpy `np.ndarray` objects. verbose (`bool`, *optional*, defaults to `True`): Whether or not to print more information and warnings. """ # If we have a list of dicts, let's convert it in a dict of lists # We do this to allow using this method as a collate_fn function in PyTorch Dataloader if isinstance(encoded_inputs, (list, tuple)) and isinstance( encoded_inputs[0], Mapping): encoded_inputs = { key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys() } # The model's main input name, usually `input_ids`, has be passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" ) required_input = encoded_inputs[self.model_input_names[0]] if not required_input: if return_attention_mask: encoded_inputs["attention_mask"] = [] return encoded_inputs # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects # and rebuild them afterwards if no return_tensors is specified # Note that we lose the specific device the tensor may be on for PyTorch first_element = required_input[0] if isinstance(first_element, (list, tuple)): # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. for item in required_input: if len(item) != 0: first_element = item[0] break # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. if not isinstance(first_element, (int, list, tuple)): if is_tf_available() and _is_tensorflow(first_element): return_tensors = "tf" if return_tensors is None else return_tensors elif is_torch_available() and _is_torch(first_element): return_tensors = "pt" if return_tensors is None else return_tensors elif isinstance(first_element, np.ndarray): return_tensors = "np" if return_tensors is None else return_tensors else: raise ValueError( f"type of {first_element} unknown: {type(first_element)}. " "Should be one of a python, numpy, pytorch or tensorflow object." 
) for key, value in encoded_inputs.items(): encoded_inputs[key] = to_py_obj(value) # Convert padding_strategy in PaddingStrategy padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( padding=padding, max_length=max_length, verbose=verbose) required_input = encoded_inputs[self.model_input_names[0]] if required_input and not isinstance(required_input[0], (list, tuple)): encoded_inputs = self._pad( encoded_inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) batch_size = len(required_input) assert all( len(v) == batch_size for v in encoded_inputs.values() ), "Some items in the output dictionary have a different batch size than others." if padding_strategy == PaddingStrategy.LONGEST: max_length = max(len(inputs) for inputs in required_input) padding_strategy = PaddingStrategy.MAX_LENGTH batch_outputs = {} for i in range(batch_size): inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) outputs = self._pad( inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) return BatchEncoding(batch_outputs, tensor_type=return_tensors) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of zeros. """ eos = [self.eos_token_id] if token_ids_1 is None: return len(token_ids_0 + eos) * [0] return len(token_ids_0 + eos + token_ids_1 + eos) * [0] def _add_eos_if_not_present(self, token_ids): """Do not add eos again if user already added it.""" if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: warnings.warn( f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" " eos tokens being added.") return token_ids else: return token_ids + [self.eos_token_id] def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0): """ Truncates a sequence pair in-place following the strategy. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (`int`, *optional*, defaults to 0): Number of tokens to remove using the truncation strategy. truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): The strategy to follow for truncation. Can be: - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided. 
- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater than the model maximum admissible input size). stride (`int`, *optional*, defaults to 0): If set to a positive number, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. Returns: `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair of sequences (or a batch of pairs) is provided. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if not isinstance(truncation_strategy, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation_strategy) overflowing_tokens = [] if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) if self.truncation_side == "left": overflowing_tokens = ids[:window_len] ids = ids[num_tokens_to_remove:] elif self.truncation_side == "right": overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: raise ValueError( f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'." ) else: error_msg = ( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. ") if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: logger.warning( "Be aware, overflowing tokens are not returned for the setting you have chosen," f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " "truncation strategy. 
So the returned list will always be empty even if some " "tokens have been removed.") for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): if self.truncation_side == "right": ids = ids[:-1] elif self.truncation_side == "left": ids = ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: if self.truncation_side == "right": pair_ids = pair_ids[:-1] elif self.truncation_side == "left": pair_ids = pair_ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) if self.truncation_side == "right": overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif self.truncation_side == "left": overflowing_tokens = pair_ids[:window_len] pair_ids = pair_ids[num_tokens_to_remove:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: logger.error( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the second sequence has a length {len(pair_ids)}. " f"Please select another truncation strategy than {truncation_strategy}, " "for instance 'longest_first' or 'only_first'.") return (ids, pair_ids, overflowing_tokens) def num_special_tokens_to_add(self, pair: bool=False) -> int: """ Returns the number of added tokens when encoding a sequence with special tokens. This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair (`bool`, *optional*, defaults to `False`): Whether the number of added tokens should be computed in the case of a sequence pair or a single sequence. Returns: `int`: Number of special tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def prepare_for_model(self, ids, pair_ids=None, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, prepend_batch_axis=False, **kwargs): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an error. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. 
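# Hypothetical sketch (not used by this class) of the 'longest_first' loop in
# `truncate_sequences` above: with truncation_side == "right", one token at a
# time is dropped from whichever sequence is currently longer.
def _longest_first_sketch(ids, pair_ids, num_tokens_to_remove):
    ids, pair_ids = list(ids), list(pair_ids)
    for _ in range(num_tokens_to_remove):
        if len(ids) > len(pair_ids):
            ids = ids[:-1]
        else:
            pair_ids = pair_ids[:-1]
    return ids, pair_ids

# _longest_first_sketch([1, 2, 3, 4, 5], [6, 7], 3) -> ([1, 2], [6, 7])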
""" # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 if return_token_type_ids and not add_special_tokens: raise ValueError( "Asking to return token_type_ids while setting add_special_tokens to False " "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None.") if (return_overflowing_tokens and truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is not None): raise ValueError( "Not possible to return overflowing tokens for pair of sequences with the " "`longest_first`. Please select another truncation strategy than `longest_first`, " "for instance `only_second` or `only_first`.") # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( pair=pair) if add_special_tokens else 0) # Truncation: Handle max sequence length overflowing_tokens = [] if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, pair_ids=pair_ids, num_tokens_to_remove=total_len - max_length, truncation_strategy=truncation_strategy, stride=stride, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences( ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) # Build output dictionary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids if return_special_tokens_mask: if add_special_tokens: encoded_inputs[ "special_tokens_mask"] = self.get_special_tokens_mask( ids, pair_ids) else: encoded_inputs["special_tokens_mask"] = [0] * len(sequence) # Check lengths self._eventual_warn_about_too_long_sequence( encoded_inputs["input_ids"], max_length, verbose) # Padding if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: encoded_inputs = self.pad( encoded_inputs, max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) if return_length: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) batch_outputs = BatchEncoding( encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis) return batch_outputs def _batch_prepare_for_model( self, batch_ids_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, 
return_length=False, verbose=True, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: batch_ids_pairs: list of tokenized input ids or input ids pairs """ batch_outputs = {} for first_ids, second_ids in batch_ids_pairs: outputs = self.prepare_for_model( first_ids, second_ids, add_special_tokens=add_special_tokens, padding=PaddingStrategy.DO_NOT_PAD. value, # we pad in batch afterward truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=None, # We convert the whole batch to tensors at the end prepend_batch_axis=False, verbose=verbose, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) batch_outputs = self.pad( batch_outputs, padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) batch_outputs = BatchEncoding( batch_outputs, tensor_type=return_tensors) return batch_outputs def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 
512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." ) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. 
" "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." ) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): """ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. This method is deprecated, `__call__` should be used instead. Args: batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`): Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in `encode_plus`). """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) return self._batch_encode_plus( batch_text_or_text_pairs=batch_text_or_text_pairs, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, verbose=verbose, **kwargs, ) def _batch_encode_plus( self, batch_text_or_text_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return 
self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast.") input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if not isinstance(ids_or_pair_ids, (list, tuple)): ids, pair_ids = ids_or_pair_ids, None elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): ids, pair_ids = ids_or_pair_ids, None else: ids, pair_ids = ids_or_pair_ids first_ids = get_input_ids(ids) second_ids = get_input_ids( pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) batch_outputs = self._batch_prepare_for_model( input_ids, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=return_tensors, verbose=verbose, ) return BatchEncoding(batch_outputs) def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens, using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. Args: text (`str`): The sequence to be encoded. **kwargs (additional keyword arguments): Passed along to the model-specific `prepare_for_tokenization` preprocessing method. Returns: `List[str]`: The list of tokens. """ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors all_special_tokens_extended = dict( (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)) text, kwargs = self.prepare_for_tokenization(text, **kwargs) if kwargs: logger.warning(f"Keyword arguments {kwargs} not recognized.") # TODO: should this be in the base class? 
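# Illustrative sketch (hypothetical helper, not the Trie used below) of the idea
# behind `tokens_trie.split`: added/special tokens are kept as atomic chunks and
# only the remaining text is handed to `_tokenize`.
import re

def _split_on_special_sketch(text, special_tokens):
    pattern = "(" + "|".join(re.escape(t) for t in special_tokens) + ")"
    return [chunk for chunk in re.split(pattern, text) if chunk]

# _split_on_special_sketch("hello <|endoftext|> world", ["<|endoftext|>"])
# -> ['hello ', '<|endoftext|>', ' world']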
if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: tok_extended = all_special_tokens_extended.get(token, None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): if tok_extended.rstrip and right: # A bit counter-intuitive but we strip the left of the string # since tok_extended.rstrip means the special token is eating all white spaces on its right tokens[i + 1] = right.lstrip() # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here else: # We strip left and right by default if right: tokens[i + 1] = right.lstrip() if left: tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: # Need to skip eventual empty (fully stripped) tokens if not token: continue if token in no_split_token: tokenized_text.append(token) else: tokenized_text.extend(self._tokenize(token)) # ["This", " is", " something", "", "else"] return tokenized_text class SPMTokenizer: r""" Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece). Args: vocab_file (`str`): [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that contains the vocabulary necessary to instantiate a tokenizer. sp_model_kwargs (`dict`, *optional*): Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set: - `enable_sampling`: Enable subword regularization. - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - `nbest_size = {0,1}`: No sampling is performed. - `nbest_size > 1`: samples from the nbest_size results. - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) using forward-filtering-and-backward-sampling algorithm. - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for BPE-dropout. 
""" def __init__(self, vocab_file, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]]=None): self.split_by_punct = split_by_punct self.vocab_file = vocab_file self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) if not os.path.exists(vocab_file): raise FileNotFoundError(f"{vocab_file} does not exist!") spm.load(vocab_file) bpe_vocab_size = spm.GetPieceSize() # Token map # 0+1 # 1+1 # 2+1 self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)} self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] # self.vocab['[PAD]'] = 0 # self.vocab['[CLS]'] = 1 # self.vocab['[SEP]'] = 2 # self.vocab['[UNK]'] = 3 self.spm = spm def __getstate__(self): state = self.__dict__.copy() state["spm"] = None return state def __setstate__(self, d): self.__dict__ = d # for backward compatibility if not hasattr(self, "sp_model_kwargs"): self.sp_model_kwargs = {} self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) self.spm.Load(self.vocab_file) def tokenize(self, text): return self._encode_as_pieces(text) def convert_ids_to_tokens(self, ids): tokens = [] for i in ids: tokens.append(self.ids_to_tokens[i]) return tokens def decode(self, tokens, start=-1, end=-1, raw_text=None): if raw_text is None: return self.spm.decode_pieces([t for t in tokens]) else: words = self.split_to_words(raw_text) word_tokens = [self.tokenize(w) for w in words] token2words = [0] * len(tokens) tid = 0 for i, w in enumerate(word_tokens): for k, t in enumerate(w): token2words[tid] = i tid += 1 word_start = token2words[start] word_end = token2words[end] if end < len(tokens) else len(words) text = "".join(words[word_start:word_end]) return text def add_special_token(self, token): if token not in self.special_tokens: self.special_tokens.append(token) if token not in self.vocab: self.vocab[token] = len(self.vocab) - 1 self.ids_to_tokens.append(token) return self.id(token) def part_of_whole_word(self, token, is_bos=False): if is_bos: return True if (len(token) == 1 and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))) or token in self.special_tokens: return False word_start = b"\xe2\x96\x81".decode("utf-8") return not token.startswith(word_start) def pad(self): return "[PAD]" def bos(self): return "[CLS]" def eos(self): return "[SEP]" def unk(self): return "[UNK]" def mask(self): return "[MASK]" def sym(self, id): return self.ids_to_tokens[id] def id(self, sym): return self.vocab[sym] if sym in self.vocab else 1 def _encode_as_pieces(self, text): text = convert_to_unicode(text) if self.split_by_punct: words = self._run_split_on_punc(text) pieces = [self.spm.encode(w, out_type=str) for w in words] return [p for w in pieces for p in w] else: return self.spm.encode(text, out_type=str) def split_to_words(self, text): pieces = self._encode_as_pieces(text) word_start = b"\xe2\x96\x81".decode("utf-8") words = [] offset = 0 prev_end = 0 for i, p in enumerate(pieces): if p.startswith(word_start): if offset > prev_end: words.append(text[prev_end:offset]) prev_end = offset w = p.replace(word_start, "") else: w = p try: s = text.index(w, offset) pn = "" k = i + 1 while k < len(pieces): pn = pieces[k].replace(word_start, "") if len(pn) > 0: break k += 1 if len(pn) > 0 and pn in text[offset:s]: offset = offset + 1 else: offset = s + len(w) except Exception: offset = offset + 1 if prev_end < offset: words.append(text[prev_end:offset]) return words def _run_strip_accents(self, text): """Strips 
accents from a piece of text.""" text = unicodedata.normalize("NFD", text) output = [] for char in text: cat = unicodedata.category(char) if cat == "Mn": continue output.append(char) return "".join(output) def _run_split_on_punc(self, text): """Splits punctuation on a piece of text.""" chars = list(text) i = 0 start_new_word = True output = [] while i < len(chars): char = chars[i] if _is_punctuation(char): output.append([char]) start_new_word = True else: if start_new_word: output.append([]) start_new_word = False output[-1].append(char) i += 1 return ["".join(x) for x in output] def save_pretrained(self, path: str, filename_prefix: str=None): filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]] if filename_prefix is not None: filename = filename_prefix + "-" + filename full_path = os.path.join(path, filename) with open(full_path, "wb") as fs: fs.write(self.spm.serialized_model_proto()) return (full_path, ) def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `chars` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False def convert_to_unicode(text): """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" if isinstance(text, str): return text elif isinstance(text, bytes): return text.decode("utf-8", "ignore") else: raise ValueError(f"Unsupported string type: {type(text)}") ================================================ FILE: ppfleetx/data/tokenizers/ernie_tokenizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddlenlp.transformers import ErnieTokenizer tokenizer = None def get_ernie_tokenizer(tokenizer_type): global tokenizer if tokenizer is None: tokenizer = ErnieTokenizer.from_pretrained(tokenizer_type) return tokenizer ================================================ FILE: ppfleetx/data/tokenizers/gpt_tokenizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" from __future__ import (absolute_import, division, print_function, unicode_literals) import sys import json import logging import warnings import os import regex as re from io import open from ppfleetx.utils.download import cached_path try: from functools import lru_cache except ImportError: # Just a dummy decorator to get the checks to run on python2 # because honestly I don't want to support a byte-level unicode BPE # tokenizer on python 2 right now. def lru_cache(): return lambda func: func from ppfleetx.utils.log import logger try: import paddlenlp from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer except ImportError: raise ImportError("Please import paddlenlp before running the GPT tasks.") PRETRAINED_VOCAB_ARCHIVE_MAP = { 'gpt2': "http://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json", } PRETRAINED_MERGES_ARCHIVE_MAP = { 'gpt2': "http://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt", } PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {'gpt2': 1024, } VOCAB_NAME = 'vocab.json' MERGES_NAME = 'merges.txt' SPECIAL_TOKENS_NAME = 'special_tokens.txt' @lru_cache() def bytes_to_unicode(): """ Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ _chr = unichr if sys.version_info[0] == 2 else chr bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \ list(range(ord("®"), ord("ÿ") + 1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8 + n) n += 1 cs = [_chr(n) for n in cs] return dict(zip(bs, cs)) def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs class GPTTokenizer(object): """ GPT-2 BPE tokenizer. Peculiarities: - Byte-level BPE """ padding_side = "right" truncation_side = "right" model_input_names = ["input_ids", "token_type_ids", "attention_mask"] pad_token_type_id = 0 pad_token_id = 0 @classmethod def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ Instantiate a PreTrainedBertModel from a pre-trained model file. Download and cache the pre-trained model file if needed. 
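# Small usage sketch of `bytes_to_unicode` above: every byte of the UTF-8
# encoded text is mapped to a printable unicode character before BPE, so no
# <unk> token is ever needed for raw bytes.
byte_encoder = bytes_to_unicode()
mapped = "".join(byte_encoder[b] for b in "hello world".encode("utf-8"))
# mapped == 'helloĠworld'  (the space byte 0x20 is remapped to 'Ġ')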
""" if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ pretrained_model_name_or_path] merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[ pretrained_model_name_or_path] special_tokens_file = None else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) if not os.path.exists(special_tokens_file): special_tokens_file = None else: logger.info("loading special tokens file {}".format( special_tokens_file)) # redirect to the cache, if necessary try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path( merges_file, cache_dir=cache_dir) except Exception as e: logger.info(e) logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ', '.join( PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) logger.info("loading merges file {}".format(merges_file)) else: logger.info("loading vocabulary file {} from cache at {}".format( vocab_file, resolved_vocab_file)) logger.info("loading merges file {} from cache at {}".format( merges_file, resolved_merges_file)) if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: # if we're using a pretrained model, ensure the tokenizer wont index sequences longer # than the number of positional embeddings max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ pretrained_model_name_or_path] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) # Instantiate tokenizer. 
        if special_tokens_file and 'special_tokens' not in kwargs:
            special_tokens = open(
                special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(resolved_vocab_file,
                        resolved_merges_file,
                        special_tokens=special_tokens,
                        *inputs,
                        **kwargs)
        return tokenizer

    def __init__(self,
                 vocab_file,
                 merges_file,
                 errors='replace',
                 special_tokens=None,
                 max_len=None,
                 **kwargs):
        self.padding_side = kwargs.pop("padding_side", self.padding_side)
        if self.padding_side not in ["right", "left"]:
            raise ValueError(
                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
            )
        self.truncation_side = kwargs.pop("truncation_side",
                                          self.truncation_side)
        if self.truncation_side not in ["right", "left"]:
            raise ValueError(
                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
            )
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
        self.eod_id = self.encoder['<|endoftext|>']
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __call__(self,
                 text,
                 text_pair=None,
                 add_special_tokens=True,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 pad_to_multiple_of=None,
                 return_token_type_ids=None,
                 return_attention_mask=None,
                 return_overflowing_tokens=False,
                 return_length=False):
        assert padding in [True, False, "longest", "max_length", "do_not_pad"]
        if max_length is not None and padding is False and truncation is False:
            truncation = "longest_first"
        if padding is True:
            padding = "longest"
        elif padding is False:
            padding = "do_not_pad"
        assert truncation in [
            True, False, "only_first", "only_second", "longest_first",
            "do_not_truncate"
        ]
        if truncation is True:
            truncation = "longest_first"
        elif truncation is False:
            truncation = "do_not_truncate"
        # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
        if (truncation != "do_not_truncate" and padding != "do_not_pad" and
                pad_to_multiple_of is not None and max_length is not None and
                (max_length % pad_to_multiple_of != 0)):
            raise ValueError(
                "Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
) is_batched = isinstance(text, (list, tuple)) if is_batched: raise NotImplementedError else: return self.encode_plus( text=text, text_pair=text_pair, add_special_tokens=add_special_tokens, padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_length=return_length) def encode_plus(self, text, text_pair, add_special_tokens=True, padding="do_not_pad", truncation="do_not_truncate", max_length=None, pad_to_multiple_of=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_length=False, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise NotImplementedError first_ids = get_input_ids(text) second_ids = get_input_ids( text_pair) if text_pair is not None else None pair = bool(second_ids is not None) len_ids = len(first_ids) len_pair_ids = len(second_ids) if pair else 0 if return_token_type_ids and not add_special_tokens: raise ValueError( "Asking to return token_type_ids while setting add_special_tokens to False " "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None.") # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( pair=pair) if add_special_tokens else 0) # Truncation: Handle max sequence length overflowing_tokens = [] if truncation != "do_not_truncate" and max_length and total_len > max_length: first_ids, second_ids, overflowing_tokens = self.truncate_sequences( first_ids, pair_ids=second_ids, num_tokens_to_remove=total_len - max_length, truncation=truncation, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(first_ids, second_ids) token_type_ids = self.create_token_type_ids_from_sequences( first_ids, second_ids) else: sequence = first_ids + second_ids if pair else first_ids token_type_ids = [0] * len(first_ids) + ([0] * len(second_ids) if pair else []) # Build output dictionary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids # Padding if padding != "do_not_pad" or return_attention_mask: encoded_inputs = self.pad( encoded_inputs, max_length=max_length, padding=padding, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) if return_length: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) return encoded_inputs def num_special_tokens_to_add(self, pair: bool=False) -> int: token_ids_0 = [] 
token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return token_ids_0 return token_ids_0 + token_ids_1 def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return len(token_ids_0) * [0] return [0] * len(token_ids_0) + [1] * len(token_ids_1) def truncate_sequences( self, ids, pair_ids=None, num_tokens_to_remove=0, truncation="longest_first", stride=0, ): if num_tokens_to_remove <= 0: return ids, pair_ids, [] overflowing_tokens = [] if truncation == "only_first" or (truncation == "longest_first" and pair_ids is None): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) if self.truncation_side == "left": overflowing_tokens = ids[:window_len] ids = ids[num_tokens_to_remove:] elif self.truncation_side == "right": overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: raise ValueError( f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'." ) else: error_msg = ( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. ") if truncation == "only_first": error_msg = ( error_msg + "Please select another truncation strategy than " f"{truncation}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) elif truncation == "longest_first": warnings.warn( "Be aware, overflowing tokens are not returned for the setting you have chosen," f" i.e. sequence pairs with the '{truncation}' " "truncation strategy. So the returned list will always be empty even if some " "tokens have been removed.") for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): if self.truncation_side == "right": ids = ids[:-1] elif self.truncation_side == "left": ids = ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: if self.truncation_side == "right": pair_ids = pair_ids[:-1] elif self.truncation_side == "left": pair_ids = pair_ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) elif truncation == "only_second" and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) if self.truncation_side == "right": overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif self.truncation_side == "left": overflowing_tokens = pair_ids[:window_len] pair_ids = pair_ids[num_tokens_to_remove:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: logger.error( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the second sequence has a length {len(pair_ids)}. 
" f"Please select another truncation strategy than {truncation}, " "for instance 'longest_first' or 'only_first'.") return (ids, pair_ids, overflowing_tokens) def pad( self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, ): # The model's main input name, usually `input_ids`, has be passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" ) required_input = encoded_inputs[self.model_input_names[0]] if not required_input: if return_attention_mask: encoded_inputs["attention_mask"] = [] return encoded_inputs required_input = encoded_inputs[self.model_input_names[0]] if required_input and not isinstance(required_input[0], (list, tuple)): encoded_inputs = self._pad( encoded_inputs, max_length=max_length, padding=padding, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return encoded_inputs batch_size = len(required_input) assert all( len(v) == batch_size for v in encoded_inputs.values() ), "Some items in the output dictionary have a different batch size than others." if padding == "longest": max_length = max(len(inputs) for inputs in required_input) padding = "max_length" batch_outputs = {} for i in range(batch_size): inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) outputs = self._pad( inputs, max_length=max_length, padding=padding, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) return encoded_inputs def _pad( self, encoded_inputs, max_length=None, padding="do_not_pad", pad_to_multiple_of=None, return_attention_mask=None, ) -> dict: # Load from model defaults if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names or "attention_mask" in encoded_inputs required_input = encoded_inputs[self.model_input_names[0]] if padding == "longest": max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding != "do_not_pad" and len( required_input) != max_length # Initialize attention mask if not present. 
if return_attention_mask and "attention_mask" not in encoded_inputs: encoded_inputs["attention_mask"] = [1] * len(required_input) if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs[ "attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs[ "special_tokens_mask"] + [1] * difference if "offset_mapping" in encoded_inputs: encoded_inputs["offset_mapping"] = encoded_inputs[ "offset_mapping"] + [(0, 0)] * difference if "position_ids" in encoded_inputs: encoded_inputs["position_ids"] = encoded_inputs[ "position_ids"] + [0] * difference encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [ 0 ] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = [ self.pad_token_type_id ] * difference + encoded_inputs["token_type_ids"] if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = [ 1 ] * difference + encoded_inputs["special_tokens_mask"] if "offset_mapping" in encoded_inputs: encoded_inputs["offset_mapping"] = [ (0, 0) ] * difference + encoded_inputs["offset_mapping"] if "position_ids" in encoded_inputs: encoded_inputs["position_ids"] = [ 0 ] * difference + encoded_inputs["position_ids"] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError("Invalid padding strategy:" + str( self.padding_side)) return encoded_inputs def __len__(self): return len(self.encoder) + len(self.special_tokens) def set_special_tokens(self, special_tokens): """ Add a list of additional tokens to the encoder. The additional tokens are indexed starting from the last index of the current vocabulary in the order of the `special_tokens` list. """ if not special_tokens: self.special_tokens = {} self.special_tokens_decoder = {} return self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) self.special_tokens_decoder = { v: k for k, v in self.special_tokens.items() } logger.info("Special tokens {}".format(self.special_tokens)) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token) pairs = get_pairs(word) if not pairs: return token while True: bigram = min( pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) new_word.extend(word[i:j]) i = j except BaseException: new_word.extend(word[i:]) break if word[i] == first and i < len(word) - 1 and word[ i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = ' '.join(word) self.cache[token] = word return word def tokenize(self, text): """ Tokenize a string. 
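Example (illustrative; assumes the `gpt2-medium-en` vocab and merges files can be loaded):

.. code-block::

    from paddlenlp.transformers import GPTTokenizer

    tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en')
    tokens = tokenizer.tokenize('Welcome to use PaddlePaddle and PaddleNLP')
    # tokens is a list of BPE pieces such as ['Welcome', 'Ġto', 'Ġuse', ...],
    # where 'Ġ' marks a leading space; the exact split depends on the merges file.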
""" bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: token = ''.join(self.byte_encoder[ord(b)] for b in token) else: token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend( bpe_token for bpe_token in self.bpe(token).split(' ')) return bpe_tokens def convert_tokens_to_ids(self, tokens): """ Converts a sequence of tokens into ids using the vocab. """ ids = [] if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): if tokens in self.special_tokens: return self.special_tokens[tokens] else: return self.encoder.get(tokens, 0) for token in tokens: if token in self.special_tokens: ids.append(self.special_tokens[token]) else: ids.append(self.encoder.get(token, 0)) if len(ids) > self.max_len: warnings.warn( "Token indices sequence length is longer than the specified maximum " " sequence length for this OpenAI GPT model ({} > {}). Running this" " sequence through the model will result in indexing errors". format(len(ids), self.max_len)) return ids def convert_ids_to_string(self, ids): """ Converts a single index or a sequence of indices to texts. Args: ids (int|List[int]): The token id (or token ids) to be converted to text. Returns: str: The decoded text. Example: .. code-block:: from paddlenlp.transformers import GPTTokenizer tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930])) # 'Welcome to use PaddlePaddle and PaddleNLP' """ text = ''.join([self.decoder[id] for id in ids]) text = bytearray([self.byte_decoder[c] for c in text]).decode( 'utf-8', errors=self.errors) return text def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """Converts a sequence of ids in BPE tokens using the vocab.""" tokens = [] for i in ids: if i in self.special_tokens_decoder: if not skip_special_tokens: tokens.append(self.special_tokens_decoder[i]) else: tokens.append(self.decoder[i]) return tokens def encode(self, text): return self.convert_tokens_to_ids(self.tokenize(text)) def decode(self, tokens): text = ''.join([ self.decoder[token] if token in self.decoder.keys() else '' for token in tokens ]) text = bytearray([self.byte_decoder[c] for c in text]).decode( 'utf-8', errors=self.errors) return text def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary and merge files to a directory.""" if not os.path.isdir(vocab_path): logger.error("Vocabulary path ({}) should be a directory".format( vocab_path)) return vocab_file = os.path.join(vocab_path, VOCAB_NAME) merge_file = os.path.join(vocab_path, MERGES_NAME) special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) with open(vocab_file, 'w', encoding='utf-8') as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: writer.write(u'#version: 0.2\n') for bpe_tokens, token_index in sorted( self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: warnings.warn( "Saving vocabulary to {}: BPE merge indices are not consecutive." " Please check that the tokenizer is not corrupted!". 
format(merge_file)) index = token_index writer.write(' '.join(bpe_tokens) + u'\n') index += 1 index = len(self.encoder) with open(special_tokens_file, 'w', encoding='utf-8') as writer: for token, token_index in sorted( self.special_tokens.items(), key=lambda kv: kv[1]): if index != token_index: warnings.warn( "Saving special tokens vocabulary to {}: BPE indices are not consecutive." " Please check that the tokenizer is not corrupted!". format(special_tokens_file)) index = token_index writer.write(token + u'\n') index += 1 return vocab_file, merge_file, special_tokens_file @property def vocab_size(self): return len(self.encoder) @property def vocab(self): return self.encoder @property def inv_vocab(self): return self.decoder @property def eos_token_id(self): return self.eod_id ================================================ FILE: ppfleetx/data/tokenizers/t5_tokenization_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py """ import bisect import itertools import re import unicodedata from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union, overload from .tokenization_utils_base import ( ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, INIT_TOKENIZER_DOCSTRING, AddedToken, BatchEncoding, EncodedInput, EncodedInputPair, PreTokenizedInput, PreTokenizedInputPair, PreTrainedTokenizerBase, TextInput, TextInputPair, TruncationStrategy, ) from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging logger = logging.get_logger(__name__) # Slow tokenizers are saved in a vocabulary plus three separated files SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" class Trie: """ Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass Loose reference https://en.wikipedia.org/wiki/Trie """ def __init__(self): self.data = {} def add(self, word): """ Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. The special key `""` is used to represent termination. This function is idempotent, adding twice the same word will leave the trie unchanged Example: ```python >>> trie = Trie() >>> trie.add("Hello 友達") >>> trie.data {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} >>> trie.add("Hello") >>> trie.data {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} ``` """ if not word: # Prevent empty string return ref = self.data for char in word: ref[char] = char in ref and ref[char] or {} ref = ref[char] ref[""] = 1 def split(self, text): """ Will look for the words added to the trie within `text`. 
Output is the original string splitted along the boundaries of the words found. This trie will match the longest possible word first ! Example: ```python >>> trie = Trie() >>> trie.split("[CLS] This is a extra_id_100") ["[CLS] This is a extra_id_100"] >>> trie.add("[CLS]") >>> trie.add("extra_id_1") >>> trie.add("extra_id_100") >>> trie.split("[CLS] This is a extra_id_100") ["[CLS]", " This is a ", "extra_id_100"] ``` """ # indexes are counted left of the chars index. # "hello", index 0, is left of h, index 1 is between h and e. # index 5 is right of the "o". # States are going to capture every possible start (indexes as above) # as keys, and have as values, a pointer to the position in the trie # where we're at. This is a partial match for now. # This enables to keep track of multiple matches while we're iterating # the string # If the trie contains, "blowing", and "lower" and we encounter the # string "blower", we need to split into ["b", "lower"]. # This is where we need to keep track of multiple possible starts. states = OrderedDict() # This will contain every indices where we need # to cut. # We force to cut at offset 0 and len(text) (added later) offsets = [0] # This is used by the lookahead which needs to skip over # some text where the full match exceeded the place in the initial # for loop skip = 0 # Main loop, Giving this algorithm O(n) complexity for current, current_char in enumerate(text): if skip and current < skip: # Prevents the lookahead for matching twice # like extra_id_100 and id_100 continue # This will track every state # that stop matching, we need to stop tracking them. # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then # fail on "b", we need to remove 0 from the valid states. to_remove = set() # Whenever we found a match, we need to drop everything # this is a greedy algorithm, it will match on the first found token reset = False # In this case, we already have partial matches (But unfinished) for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. # Lookahead to match longest first # Important in case of extra_id_1 vs extra_id_100 # Here we are also actively looking for other earlier partial # matches # "[CLS]", "L", we need to match CLS even if L is special for lookstart, looktrie_pointer in states.items(): if lookstart > start: # This partial match is later, we can stop looking break elif lookstart < start: # This partial match is earlier, the trie pointer # was already updated, so index is + 1 lookahead_index = current + 1 end = current + 1 else: # Here lookstart == start and # looktrie_pointer == trie_pointer # It wasn't updated yet so indices are current ones lookahead_index = current end = current next_char = text[ lookahead_index] if lookahead_index < len( text) else None if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index while next_char in looktrie_pointer: looktrie_pointer = looktrie_pointer[next_char] lookahead_index += 1 if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index if lookahead_index == len(text): # End of string break next_char = text[lookahead_index] # End lookahead # Storing and resetting offsets.append(start) offsets.append(end) reset = True break elif current_char in trie_pointer: # The current character being looked at has a match within the trie # update the pointer (it will be stored back into states later). 
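# (For example, while scanning "extra_id_100" with "extra_id_1" and
#  "extra_id_100" both in the trie, the pointer for that partial match simply
#  walks one level deeper per character until either the terminator "" is
#  reached or the next character is missing from the current node.)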
trie_pointer = trie_pointer[current_char] # Storing back the new pointer into the states. # Partial matches got longer by one. states[start] = trie_pointer else: # The new character has not match in the trie, we need # to stop keeping track of this partial match. # We can't do it directly within the loop because of how # python iteration works to_remove.add(start) # Either clearing the full start (we found a real match) # Or clearing only the partial matches that didn't work. if reset: states = {} else: for start in to_remove: del states[start] # If this character is a starting character within the trie # start keeping track of this partial match. if current >= skip and current_char in self.data: states[current] = self.data[current_char] # We have a cut at the end with states. for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. end = len(text) offsets.append(start) offsets.append(end) # Longest cut is always the one with lower start so the first # item so we need to break. break return self.cut_text(text, offsets) def cut_text(self, text, offsets): # We have all the offsets now, we just need to do the actual splitting. # We need to eventually add the first part of the string and the eventual # last part. offsets.append(len(text)) tokens = [] start = 0 for end in offsets: if start > end: logger.error( "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" " anyway.") continue elif start == end: # This might happen if there's a match at index 0 # we're also preventing zero-width cuts in case of two # consecutive matches continue tokens.append(text[start:end]) start = end return tokens def _is_whitespace(char): """Checks whether `char` is a whitespace character.""" # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True cat = unicodedata.category(char) if cat == "Zs": return True return False def _is_control(char): """Checks whether `char` is a control character.""" # These are technically control characters but we count them as whitespace # characters. if char == "\t" or char == "\n" or char == "\r": return False cat = unicodedata.category(char) if cat.startswith("C"): return True return False def _is_punctuation(char): """Checks whether `char` is a punctuation character.""" cp = ord(char) # We treat all non-letter/number ASCII as punctuation. # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): return True return False def _is_end_of_word(text): """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" last_char = text[-1] return bool( _is_control(last_char) | _is_punctuation(last_char) | _is_whitespace( last_char)) def _is_start_of_word(text): """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" first_char = text[0] return bool( _is_control(first_char) | _is_punctuation(first_char) | _is_whitespace( first_char)) def _insert_one_token_to_ordered_list(token_list, new_token): """ Inserts one token to an ordered list if it does not already exist. 
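A minimal illustration (hypothetical token values):

```python
token_list = ["[CLS]", "[UNK]"]
_insert_one_token_to_ordered_list(token_list, "[SEP]")
# token_list is now ["[CLS]", "[SEP]", "[UNK]"]
_insert_one_token_to_ordered_list(token_list, "[CLS]")
# "[CLS]" is already present, so token_list is unchanged
```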
Note: token_list must be sorted. """ insertion_idx = bisect.bisect_left(token_list, new_token) # Checks if new_token is already in the ordered token_list if insertion_idx < len(token_list) and token_list[ insertion_idx] == new_token: # new_token is in token_list, don't add return else: token_list.insert(insertion_idx, new_token) @add_end_docstrings(INIT_TOKENIZER_DOCSTRING) class PreTrainedTokenizer(PreTrainedTokenizerBase): """ Base class for all slow tokenizers. Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`]. Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). """ def __init__(self, **kwargs): super().__init__(**kwargs) # Added tokens - We store this for both slow and fast tokenizers # until the serialization of Fast tokenizers is updated self.added_tokens_encoder = {} self.added_tokens_decoder = {} self.unique_no_split_tokens = [] self.tokens_trie = Trie() self._decode_use_source_tokenizer = False @property def is_fast(self): return False @property def vocab_size(self): """ `int`: Size of the base vocabulary (without the added tokens). """ raise NotImplementedError def get_added_vocab(self): """ Returns the added tokens in the vocabulary as a dictionary of token to index. Returns: `Dict[str, int]`: The added tokens. """ return self.added_tokens_encoder def __len__(self): """ Size of the full vocabulary with the added tokens. """ return self.vocab_size + len(self.added_tokens_encoder) def _add_tokens(self, new_tokens, special_tokens=False): """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Args: new_tokens (`List[str]`or `List[tokenizers.AddedToken]`): Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by checking if the tokenizer assign the index of the `unk_token` to them). special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the tokens should be added as special tokens. Returns: `int`: The number of tokens actually added to the vocabulary. Examples: ```python # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") model = BertModel.from_pretrained("bert-base-uncased") num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) print("We have added", num_added_toks, "tokens") # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
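# e.g. for "bert-base-uncased" the base vocab has 30522 entries, so after adding
# the two tokens above len(tokenizer) is 30524 (assuming neither was already present).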
model.resize_token_embeddings(len(tokenizer)) ```""" new_tokens = [str(tok) for tok in new_tokens] tokens_to_add = [] for token in new_tokens: if not isinstance(token, str): raise TypeError( "Token {token} is not a string but a {type(token)}.") if not special_tokens and hasattr( self, "do_lower_case") and self.do_lower_case: token = token.lower() if (token != self.unk_token and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and token not in tokens_to_add): tokens_to_add.append(token) #if self.verbose: # logger.info(f"Adding {token} to the vocabulary") added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.added_tokens_decoder.update(added_tok_decoder) # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) if special_tokens: if len(new_tokens) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union(set(new_tokens))) else: # Or on the newly added tokens if len(tokens_to_add) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union( set(tokens_to_add))) self._create_trie(self.unique_no_split_tokens) return len(tokens_to_add) def _create_trie(self, unique_no_split_tokens): trie = Trie() for token in unique_no_split_tokens: if hasattr( self, "do_lower_case" ) and self.do_lower_case and token not in self.all_special_tokens: trie.add(token.lower()) else: trie.add(token) self.tokens_trie = trie def num_special_tokens_to_add(self, pair): """ Returns the number of added tokens when encoding a sequence with special tokens. This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair (`bool`, *optional*, defaults to `False`): Whether the number of added tokens should be computed in the case of a sequence pair or a single sequence. Returns: `int`: Number of special tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens, using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. Args: text (`str`): The sequence to be encoded. **kwargs (additional keyword arguments): Passed along to the model-specific `prepare_for_tokenization` preprocessing method. Returns: `List[str]`: The list of tokens. """ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors all_special_tokens_extended = dict( (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)) text, kwargs = self.prepare_for_tokenization(text, **kwargs) if kwargs: logger.warning("Keyword arguments {kwargs} not recognized.") # TODO: should this be in the base class? 
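# (Illustration of the lower-casing step below: with do_lower_case=True and
#  "[CLS]" registered as a special token, the pattern lower-cases everything
#  except the escaped special tokens, so "Hello [CLS] World" becomes
#  "hello [CLS] world" before the trie split.)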
if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: tok_extended = all_special_tokens_extended.get(token, None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): if tok_extended.rstrip and right: # A bit counter-intuitive but we strip the left of the string # since tok_extended.rstrip means the special token is eating all white spaces on its right tokens[i + 1] = right.lstrip() # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here else: # We strip left and right by default if right: tokens[i + 1] = right.lstrip() if left: tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: # Need to skip eventual empty (fully stripped) tokens if not token: continue if token in no_split_token: tokenized_text.append(token) else: tokenized_text.extend(self._tokenize(token)) # ["This", " is", " something", "", "else"] return tokenized_text def _tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Do NOT take care of added tokens. """ raise NotImplementedError def convert_tokens_to_ids(self, tokens): """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. Args: tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. 
""" if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): raise NotImplementedError def _encode_plus(self, text, text_pair=None, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: if is_split_into_words: raise ValueError( "Input {text} is not valid. Should be a string or a list/tuple of strings when" " `is_split_into_words=True`.") else: raise ValueError( "Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of" " integers.") if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast. 
" "More information on available tokenizers at " "https://github.com/huggingface/transformers/pull/2674") first_ids = get_input_ids(text) second_ids = get_input_ids( text_pair) if text_pair is not None else None return self.prepare_for_model( first_ids, pair_ids=second_ids, add_special_tokens=add_special_tokens, padding=padding_strategy.value, truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, verbose=verbose, ) def _batch_encode_plus( self, batch_text_or_text_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise ValueError( "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. 
" "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast.") input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if not isinstance(ids_or_pair_ids, (list, tuple)): ids, pair_ids = ids_or_pair_ids, None elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): ids, pair_ids = ids_or_pair_ids, None else: ids, pair_ids = ids_or_pair_ids first_ids = get_input_ids(ids) second_ids = get_input_ids( pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) batch_outputs = self._batch_prepare_for_model( input_ids, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=return_tensors, verbose=verbose, ) return BatchEncoding(batch_outputs) @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) def _batch_prepare_for_model( self, batch_ids_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_length=False, verbose=True, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: batch_ids_pairs: list of tokenized input ids or input ids pairs """ batch_outputs = {} for first_ids, second_ids in batch_ids_pairs: outputs = self.prepare_for_model( first_ids, second_ids, add_special_tokens=add_special_tokens, padding=PaddingStrategy.DO_NOT_PAD. value, # we pad in batch afterward truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=None, # We convert the whole batch to tensors at the end prepend_batch_axis=False, verbose=verbose, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) batch_outputs = self.pad( batch_outputs, padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) batch_outputs = BatchEncoding( batch_outputs, tensor_type=return_tensors) return batch_outputs def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): """ Performs any necessary transformations before tokenization. This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the `kwargs` at the end of the encoding process to be sure all the arguments have been used. Args: text (`str`): The text to prepare. 
is_split_into_words (`bool`, *optional*, defaults to `False`): Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification. kwargs: Keyword arguments to use for the tokenization. Returns: `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. """ return (text, kwargs) def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. Args: token_ids_0 (`List[int]`): List of ids of the first sequence. token_ids_1 (`List[int]`, *optional*): List of ids of the second sequence. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: if token_ids_1 is not None: raise ValueError( "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." ) return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) @overload def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool=False) -> str: ... @overload def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool=False) -> List[str]: ... def convert_ids_to_tokens( self, ids: Union[int, List[int]], skip_special_tokens: bool=False) -> Union[str, List[str]]: """ Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens. Args: ids (`int` or `List[int]`): The token id (or token ids) to convert to tokens. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. Returns: `str` or `List[str]`: The decoded token(s). """ if isinstance(ids, int): if ids in self.added_tokens_decoder: return self.added_tokens_decoder[ids] else: return self._convert_id_to_token(ids) tokens = [] for index in ids: index = int(index) if skip_special_tokens and index in self.all_special_ids: continue if index in self.added_tokens_decoder: tokens.append(self.added_tokens_decoder[index]) else: tokens.append(self._convert_id_to_token(index)) return tokens def _convert_id_to_token(self, index: int) -> str: raise NotImplementedError def convert_tokens_to_string(self, tokens: List[str]) -> str: return " ".join(tokens) def _decode(self, token_ids: List[int], skip_special_tokens: bool=False, clean_up_tokenization_spaces: bool=True, spaces_between_special_tokens: bool=True, **kwargs) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) filtered_tokens = self.convert_ids_to_tokens( token_ids, skip_special_tokens=skip_special_tokens) # To avoid mixing byte-level and unicode for byte-level BPT # we need to build string separately for added tokens and byte-level tokens # cf. 
https://github.com/huggingface/transformers/issues/1133 sub_texts = [] current_sub_text = [] for token in filtered_tokens: if skip_special_tokens and token in self.all_special_ids: continue if token in self.added_tokens_encoder: if current_sub_text: sub_texts.append( self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) if spaces_between_special_tokens: text = " ".join(sub_texts) else: text = "".join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text else: return text ================================================ FILE: ppfleetx/data/tokenizers/t5_tokenizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Google T5.""" from __future__ import (absolute_import, division, print_function, unicode_literals) import copy import sys import json import logging import warnings import os import regex as re from io import open from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Union, overload import sentencepiece as spm from ppfleetx.utils.download import cached_path from ppfleetx.data.tokenizers.tokenization_utils_base import ( _LazyConfigMapping, AddedToken, TruncationStrategy, PaddingStrategy, BatchEncoding, SpecialTokensMixin) try: from functools import lru_cache except ImportError: # Just a dummy decorator to get the checks to run on python2 # because honestly I don't want to support a byte-level unicode BPE # tokenizer on python 2 right now. 
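# The fallback below is a no-op decorator factory: lru_cache() returns a
# decorator that hands the function back unchanged, so nothing is cached.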
def lru_cache(): return lambda func: func from ppfleetx.utils.log import logger MAX_LENGTH = 256 VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} MODEL_FILES_NAMES = {"config_file": "config.json"} CONFIG_MAPPING_NAMES = OrderedDict([("t5", "T5Config")]) CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { 't5-11b': "https://fleet.bj.bcebos.com/datasets/t5/spiece.model", } } PRETRAINED_MERGES_ARCHIVE_MAP = { 't5-11b': "https://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt", } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "t5-small": 512, "t5-base": 512, "t5-large": 512, "t5-3b": 512, "t5-11b": 512, } # Slow tokenizers used to be saved in three separate files DEFAULT_T5_NAME = "projects/imagen/t5/t5-11b" SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" # Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file FULL_TOKENIZER_FILE = "tokenizer.json" _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json") def get_t5_tokenizer(name=DEFAULT_T5_NAME): tokenizer = T5Tokenizer.from_pretrained(name) return tokenizer def t5_tokenize(texts, tokenizer): encoded = tokenizer.batch_encode_plus( texts, return_tensors="paddle", padding='longest', max_length=MAX_LENGTH, truncation=True) input_ids = encoded.input_ids attn_mask = encoded.attention_mask return input_ids, attn_mask class T5Tokenizer(SpecialTokensMixin): """ T5 tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES config_files_names = MODEL_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] slow_tokenizer_class = None padding_side = "right" truncation_side = "right" def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>", pad_token="<pad>", extra_ids=100, additional_special_tokens=None, sp_model_kwargs=None, **kwargs): # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: additional_special_tokens = [ f"<extra_id_{i}>" for i in range(extra_ids) ] elif extra_ids > 0 and additional_special_tokens is not None: # Check that we have the right number of extra_id special tokens extra_tokens = len( set( filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) if extra_tokens != extra_ids: raise ValueError( f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" " provided to T5Tokenizer. 
In this case the additional_special_tokens must include the extra_ids" " tokens") self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, extra_ids=extra_ids, additional_special_tokens=additional_special_tokens, sp_model_kwargs=self.sp_model_kwargs, **kwargs) self.vocab_file = vocab_file self._extra_ids = extra_ids self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) self.deprecation_warnings = ({}) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) use_auth_token = kwargs.pop("use_auth_token", None) revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} if os.path.isfile(pretrained_model_name_or_path): if len(cls.vocab_files_names) > 1: raise ValueError( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " "supported for this tokenizer. Use a model identifier or the path to a directory instead." ) warnings.warn( f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.", FutureWarning, ) file_id = list(cls.vocab_files_names.keys())[0] vocab_files[file_id] = pretrained_model_name_or_path else: # At this point pretrained_model_name_or_path is either a directory or a model identifier name additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, } vocab_files_target = { ** cls.vocab_files_names, ** cls.config_files_names, ** additional_files_names } if "tokenizer_file" in vocab_files_target: # Try to get the tokenizer config to see if there are versioned tokenizer files. fast_tokenizer_file = FULL_TOKENIZER_FILE resolved_config_file = get_file_from_repo( pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, proxies=proxies, use_auth_token=use_auth_token, revision=revision, local_files_only=local_files_only, ) if resolved_config_file is not None: with open( resolved_config_file, encoding="utf-8") as reader: tokenizer_config = json.load(reader) if "fast_tokenizer_files" in tokenizer_config: fast_tokenizer_file = get_fast_tokenizer_file( tokenizer_config["fast_tokenizer_files"]) vocab_files_target["tokenizer_file"] = fast_tokenizer_file # Look for the tokenizer files for file_id, file_name in vocab_files_target.items(): if os.path.isdir(pretrained_model_name_or_path): if subfolder is not None: full_file_name = os.path.join( pretrained_model_name_or_path, subfolder, file_name) else: full_file_name = os.path.join( pretrained_model_name_or_path, file_name) if not os.path.exists(full_file_name): #logger.info("Didn't find file {full_file_name}. 
We won't load it.") full_file_name = None vocab_files[file_id] = full_file_name # Get files from url, cache, or disk depending on the case resolved_vocab_files = {} unresolved_files = [] for file_id, file_path in vocab_files.items(): if file_path is None: resolved_vocab_files[file_id] = None else: try: resolved_vocab_files[file_id] = cached_path( file_path, cache_dir=cache_dir, ) except EnvironmentError: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ', '.join( PRETRAINED_VOCAB_ARCHIVE_MAP.keys( )), pretrained_model_name_or_path, vocab_file, merges_file)) return None if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): raise EnvironmentError( f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing all relevant files for a {cls.__name__} tokenizer." ) for file_id, file_path in vocab_files.items(): if file_id not in resolved_vocab_files: continue return cls._from_pretrained( resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=use_auth_token, cache_dir=cache_dir, **kwargs, ) @classmethod def _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, use_auth_token=None, cache_dir=None, **kwargs): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json # file or if `from_slow` is set to True. from_slow = kwargs.get("from_slow", False) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None if (from_slow or not has_tokenizer_file ) and cls.slow_tokenizer_class is not None: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( copy.deepcopy(resolved_vocab_files), pretrained_model_name_or_path, copy.deepcopy(init_configuration), *init_inputs, **(copy.deepcopy(kwargs)), ) else: slow_tokenizer = None # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? tokenizer_config_file = resolved_vocab_files.pop( "tokenizer_config_file", None) if tokenizer_config_file is not None: with open( tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. config_tokenizer_class = init_kwargs.get("tokenizer_class") init_kwargs.pop("tokenizer_class", None) init_kwargs.pop("auto_map", None) saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: config_tokenizer_class = None init_kwargs = init_configuration if config_tokenizer_class is None: # Second attempt. If we have not yet found tokenizer_class, let's try to use the config. try: config_dict = resolved_vocab_files.pop("config_file", None) config_dict = cls._dict_from_json_file(config_dict) config_tokenizer_class = config_dict[ "tokenizer_class"] if "tokenizer_class" in config_dict else None except (OSError, ValueError, KeyError): # skip if an error occurred. config_dict = None if config_tokenizer_class is None: # Third attempt. 
If we have not yet found the original type of the tokenizer, # we are loading we see if we can infer it from the type of the configuration file from ppfleetx.data.tokenizers.tokenization_utils_base import TOKENIZER_MAPPING_NAMES # tests_ignore model_type = config_dict[ "model_type"] if "model_type" in config_dict else None if model_type is None: # Fallback: use pattern matching on the string. model_type = None for pattern in TOKENIZER_MAPPING_NAMES.keys(): if pattern in str(pretrained_model_name_or_path): model_type = pattern break if model_type is not None: config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( model_type, (None, None)) if config_tokenizer_class is None: config_tokenizer_class = config_tokenizer_class_fast if config_tokenizer_class is not None: if cls.__name__.replace( "Fast", "") != config_tokenizer_class.replace("Fast", ""): logger.warning( "The tokenizer class you load from this checkpoint is not the same type as the class this" " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you" f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called" f" from is '{cls.__name__}'.") # Update with newly provided kwargs init_kwargs.update(kwargs) # Convert AddedTokens serialized as dict to class instances def convert_added_tokens(obj): if isinstance(obj, dict) and "__type" in obj and obj[ "__type"] == "AddedToken": obj.pop("__type") return AddedToken(**obj) elif isinstance(obj, (list, tuple)): return list(convert_added_tokens(o) for o in obj) elif isinstance(obj, dict): return {k: convert_added_tokens(v) for k, v in obj.items()} return obj init_kwargs = convert_added_tokens(init_kwargs) # Set max length if needed if pretrained_model_name_or_path in cls.max_model_input_sizes: # if we're using a pretrained model, ensure the tokenizer # wont index sequences longer than the number of positional embeddings model_max_length = cls.max_model_input_sizes[ pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): model_max_length = min( init_kwargs.get("model_max_length", int(1e30)), model_max_length) # TODO(PVP) - uncomment following line in Transformers v5 # init_kwargs["model_max_length"] = model_max_length # TODO(PVP) - remove in Transformers v5 # --- init_kwargs[ "model_max_length"] = cls._eventually_correct_t5_max_length( pretrained_model_name_or_path, model_max_length, init_kwargs.get("model_max_length")) # --- # Merge resolved_vocab_files arguments in init_kwargs. added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer init_kwargs["name_or_path"] = pretrained_model_name_or_path # Instantiate tokenizer. try: tokenizer = cls(**init_kwargs) except OSError: raise OSError( "Unable to load vocabulary from file. " "Please check that the provided vocabulary is accessible and not corrupted." ) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` # Removed: Now done at the base class level # tokenizer.init_inputs = init_inputs # tokenizer.init_kwargs = init_kwargs # If there is a complementary special token map, load it special_tokens_map_file = resolved_vocab_files.pop( "special_tokens_map_file", None) # Add supplementary tokens. 
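# (Roughly, for the default T5 configuration this covers eos/unk/pad plus the
#  100 "<extra_id_*>" sentinels passed through additional_special_tokens;
#  sanitize_special_tokens() below re-registers any of them that the loaded
#  tokenizer does not yet know about or treat as "no split" tokens.)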
special_tokens = tokenizer.all_special_tokens # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab added_tokens = tokenizer.sanitize_special_tokens() if added_tokens: logger.warning_advice( "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" " fine-tuned or trained.") return tokenizer def _eventual_warn_about_too_long_sequence(self, ids, max_length, verbose: bool): """ Depending on the input and internal state we might trigger a warning about a sequence that is too long for its corresponding model Args: ids (`List[str]`): The ids produced by the tokenization max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set) verbose (`bool`): Whether or not to print more information and warnings. """ if max_length is None and len(ids) > self.model_max_length and verbose: if not self.deprecation_warnings.get( "sequence-length-is-longer-than-the-specified-maximum", False): logger.warning( "Token indices sequence length is longer than the specified maximum sequence length " f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " "will result in indexing errors") self.deprecation_warnings[ "sequence-length-is-longer-than-the-specified-maximum"] = True def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. 
" "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." ) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. " "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." 
) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def _pad(self, encoded_inputs, max_length=None, padding_strategy=PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of=None, return_attention_mask=None): """ Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Args: encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). max_length: maximum length of the returned list and optionally padding length (see below). Will truncate by taking into account the special tokens. padding_strategy: PaddingStrategy to use for padding. - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad The tokenizer padding sides are defined in self.padding_side: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names required_input = encoded_inputs[self.model_input_names[0]] if padding_strategy == PaddingStrategy.LONGEST: max_length = len(required_input) if max_length is not None and pad_to_multiple_of is not None and ( max_length % pad_to_multiple_of != 0): max_length = ( (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( required_input) != max_length # Initialize attention mask if not present. 
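        # Worked example (hypothetical values, assuming padding_side="right",
        # pad_token_id=0, max_length=6 and required_input=[101, 2023, 102]):
        # the padding branch below would yield
        #     input_ids      -> [101, 2023, 102, 0, 0, 0]
        #     attention_mask -> [1, 1, 1, 0, 0, 0]
        # i.e. real tokens keep mask value 1 and the appended pad positions get 0.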
if return_attention_mask and "attention_mask" not in encoded_inputs: encoded_inputs["attention_mask"] = [1] * len(required_input) if needs_to_be_padded: difference = max_length - len(required_input) if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs[ "attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference) if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs[ "special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[ 0]] = required_input + [self.pad_token_id] * difference elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [ 0 ] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = [ self.pad_token_type_id ] * difference + encoded_inputs["token_type_ids"] if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = [ 1 ] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[ 0]] = [self.pad_token_id] * difference + required_input else: raise ValueError("Invalid padding strategy:" + str( self.padding_side)) return encoded_inputs def pad( self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_attention_mask=None, return_tensors=None, verbose=True, ): """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`) If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of PyTorch tensors, you will lose the specific device of your tensors however. Args: encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask) return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return Numpy `np.ndarray` objects. verbose (`bool`, *optional*, defaults to `True`): Whether or not to print more information and warnings. """ # If we have a list of dicts, let's convert it in a dict of lists # We do this to allow using this method as a collate_fn function in PyTorch Dataloader if isinstance(encoded_inputs, (list, tuple)) and isinstance( encoded_inputs[0], Mapping): encoded_inputs = { key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys() } # The model's main input name, usually `input_ids`, has be passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" ) required_input = encoded_inputs[self.model_input_names[0]] if not required_input: if return_attention_mask: encoded_inputs["attention_mask"] = [] return encoded_inputs # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects # and rebuild them afterwards if no return_tensors is specified # Note that we lose the specific device the tensor may be on for PyTorch first_element = required_input[0] if isinstance(first_element, (list, tuple)): # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. for item in required_input: if len(item) != 0: first_element = item[0] break # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. if not isinstance(first_element, (int, list, tuple)): if is_tf_available() and _is_tensorflow(first_element): return_tensors = "tf" if return_tensors is None else return_tensors elif is_torch_available() and _is_torch(first_element): return_tensors = "pt" if return_tensors is None else return_tensors elif isinstance(first_element, np.ndarray): return_tensors = "np" if return_tensors is None else return_tensors else: raise ValueError( f"type of {first_element} unknown: {type(first_element)}. " "Should be one of a python, numpy, pytorch or tensorflow object." 
) for key, value in encoded_inputs.items(): encoded_inputs[key] = to_py_obj(value) # Convert padding_strategy in PaddingStrategy padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( padding=padding, max_length=max_length, verbose=verbose) required_input = encoded_inputs[self.model_input_names[0]] if required_input and not isinstance(required_input[0], (list, tuple)): encoded_inputs = self._pad( encoded_inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) batch_size = len(required_input) assert all( len(v) == batch_size for v in encoded_inputs.values() ), "Some items in the output dictionary have a different batch size than others." if padding_strategy == PaddingStrategy.LONGEST: max_length = max(len(inputs) for inputs in required_input) padding_strategy = PaddingStrategy.MAX_LENGTH batch_outputs = {} for i in range(batch_size): inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) outputs = self._pad( inputs, max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) return BatchEncoding(batch_outputs, tensor_type=return_tensors) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of zeros. """ eos = [self.eos_token_id] if token_ids_1 is None: return len(token_ids_0 + eos) * [0] return len(token_ids_0 + eos + token_ids_1 + eos) * [0] def _add_eos_if_not_present(self, token_ids): """Do not add eos again if user already added it.""" if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: warnings.warn( f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" " eos tokens being added.") return token_ids else: return token_ids + [self.eos_token_id] def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format: - single sequence: `X ` - pair of sequences: `A B ` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ token_ids_0 = self._add_eos_if_not_present(token_ids_0) if token_ids_1 is None: return token_ids_0 else: token_ids_1 = self._add_eos_if_not_present(token_ids_1) return token_ids_0 + token_ids_1 def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0): """ Truncates a sequence pair in-place following the strategy. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. 
pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. num_tokens_to_remove (`int`, *optional*, defaults to 0): Number of tokens to remove using the truncation strategy. truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): The strategy to follow for truncation. Can be: - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided. - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater than the model maximum admissible input size). stride (`int`, *optional*, defaults to 0): If set to a positive number, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. Returns: `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair of sequences (or a batch of pairs) is provided. """ if num_tokens_to_remove <= 0: return ids, pair_ids, [] if not isinstance(truncation_strategy, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation_strategy) overflowing_tokens = [] if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None): if len(ids) > num_tokens_to_remove: window_len = min(len(ids), stride + num_tokens_to_remove) if self.truncation_side == "left": overflowing_tokens = ids[:window_len] ids = ids[num_tokens_to_remove:] elif self.truncation_side == "right": overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] else: raise ValueError( f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'." ) else: error_msg = ( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the first sequence has a length {len(ids)}. ") if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: logger.warning( "Be aware, overflowing tokens are not returned for the setting you have chosen," f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " "truncation strategy. 
So the returned list will always be empty even if some " "tokens have been removed.") for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): if self.truncation_side == "right": ids = ids[:-1] elif self.truncation_side == "left": ids = ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: if self.truncation_side == "right": pair_ids = pair_ids[:-1] elif self.truncation_side == "left": pair_ids = pair_ids[1:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: if len(pair_ids) > num_tokens_to_remove: window_len = min(len(pair_ids), stride + num_tokens_to_remove) if self.truncation_side == "right": overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] elif self.truncation_side == "left": overflowing_tokens = pair_ids[:window_len] pair_ids = pair_ids[num_tokens_to_remove:] else: raise ValueError("invalid truncation strategy:" + str( self.truncation_side)) else: logger.error( f"We need to remove {num_tokens_to_remove} to truncate the input " f"but the second sequence has a length {len(pair_ids)}. " f"Please select another truncation strategy than {truncation_strategy}, " "for instance 'longest_first' or 'only_first'.") return (ids, pair_ids, overflowing_tokens) def prepare_for_model(self, ids, pair_ids=None, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, prepend_batch_axis=False, **kwargs): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an error. Args: ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. pair_ids (`List[int]`, *optional*): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods. """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 if return_token_type_ids and not add_special_tokens: raise ValueError( "Asking to return token_type_ids while setting add_special_tokens to False " "results in an undefined behavior. Please set add_special_tokens to True or " "set return_token_type_ids to None.") if (return_overflowing_tokens and truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is not None): raise ValueError( "Not possible to return overflowing tokens for pair of sequences with the " "`longest_first`. 
Please select another truncation strategy than `longest_first`, " "for instance `only_second` or `only_first`.") # Load from model defaults if return_token_type_ids is None: return_token_type_ids = "token_type_ids" in self.model_input_names if return_attention_mask is None: return_attention_mask = "attention_mask" in self.model_input_names encoded_inputs = {} # Compute the total size of the returned encodings total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( pair=pair) if add_special_tokens else 0) # Truncation: Handle max sequence length overflowing_tokens = [] if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, pair_ids=pair_ids, num_tokens_to_remove=total_len - max_length, truncation_strategy=truncation_strategy, stride=stride, ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length # Add special tokens if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences( ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) # Build output dictionary encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids if return_special_tokens_mask: if add_special_tokens: encoded_inputs[ "special_tokens_mask"] = self.get_special_tokens_mask( ids, pair_ids) else: encoded_inputs["special_tokens_mask"] = [0] * len(sequence) # Check lengths self._eventual_warn_about_too_long_sequence( encoded_inputs["input_ids"], max_length, verbose) # Padding if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: encoded_inputs = self.pad( encoded_inputs, max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) if return_length: encoded_inputs["length"] = len(encoded_inputs["input_ids"]) batch_outputs = BatchEncoding( encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis) return batch_outputs def _batch_prepare_for_model( self, batch_ids_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_length=False, verbose=True, ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens Args: batch_ids_pairs: list of tokenized input ids or input ids pairs """ batch_outputs = {} for first_ids, second_ids in batch_ids_pairs: outputs = self.prepare_for_model( first_ids, second_ids, add_special_tokens=add_special_tokens, padding=PaddingStrategy.DO_NOT_PAD. 
value, # we pad in batch afterward truncation=truncation_strategy.value, max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=None, # We convert the whole batch to tensors at the end prepend_batch_axis=False, verbose=verbose, ) for key, value in outputs.items(): if key not in batch_outputs: batch_outputs[key] = [] batch_outputs[key].append(value) batch_outputs = self.pad( batch_outputs, padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, ) batch_outputs = BatchEncoding( batch_outputs, tensor_type=return_tensors) return batch_outputs def _get_padding_truncation_strategies(self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs): """ Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy and pad_to_max_length) and behaviors. """ old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length if max_length is not None and padding is False and truncation is False: if verbose: if not self.deprecation_warnings.get( "Truncation-not-explicitly-activated", False): logger.warning( "Truncation was not explicitly activated but `max_length` is provided a specific value, please" " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" " tokenizer you can select this strategy more precisely by providing a specific strategy to" " `truncation`.") self.deprecation_warnings[ "Truncation-not-explicitly-activated"] = True truncation = "longest_first" # Get padding strategy if padding is False and old_pad_to_max_length: if verbose: warnings.warn( "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " "maximal input size of the model (e.g. 512 for Bert).", FutureWarning, ) if max_length is None: padding_strategy = PaddingStrategy.LONGEST else: padding_strategy = PaddingStrategy.MAX_LENGTH elif padding is not False: if padding is True: if verbose: if max_length is not None and ( truncation is False or truncation == "do_not_truncate"): warnings.warn( "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`.") if old_pad_to_max_length is not False: warnings.warn( "Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`." 
) padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) elif isinstance(padding, PaddingStrategy): padding_strategy = padding else: padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy if truncation is False and old_truncation_strategy != "do_not_truncate": if verbose: warnings.warn( "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" " `truncation=True` to truncate examples to a max length. You can give a specific length with" " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" " truncation strategy selected among `truncation='only_first'` (will only truncate the first" " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" " in the pairs).", FutureWarning, ) truncation_strategy = TruncationStrategy(old_truncation_strategy) elif truncation is not False: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST ) # Default to truncate the longest sequences in pairs of inputs elif not isinstance(truncation, TruncationStrategy): truncation_strategy = TruncationStrategy(truncation) elif isinstance(truncation, TruncationStrategy): truncation_strategy = truncation else: truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE # Set max length if needed if max_length is None: if padding_strategy == PaddingStrategy.MAX_LENGTH: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-pad-to-max_length", False): logger.warning( "Asking to pad to max_length but no maximum length is provided and the model has no" " predefined maximum length. Default to no padding." ) self.deprecation_warnings[ "Asking-to-pad-to-max_length"] = True padding_strategy = PaddingStrategy.DO_NOT_PAD else: max_length = self.model_max_length if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if self.model_max_length > LARGE_INTEGER: if verbose: if not self.deprecation_warnings.get( "Asking-to-truncate-to-max_length", False): logger.warning( "Asking to truncate to max_length but no maximum length is provided and the model has" " no predefined maximum length. Default to no truncation." ) self.deprecation_warnings[ "Asking-to-truncate-to-max_length"] = True truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE else: max_length = self.model_max_length # Test if we have a padding token if padding_strategy != PaddingStrategy.DO_NOT_PAD and ( not self.pad_token or self.pad_token_id < 0): raise ValueError( "Asking to pad but the tokenizer does not have a padding token. " "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." 
) # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided if (truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and padding_strategy != PaddingStrategy.DO_NOT_PAD and pad_to_multiple_of is not None and max_length is not None and (max_length % pad_to_multiple_of != 0)): raise ValueError( "Truncation and padding are both activated but " f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." ) return padding_strategy, truncation_strategy, max_length, kwargs def batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): """ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. This method is deprecated, `__call__` should be used instead. Args: batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`): Batch of sequences or pair of sequences to be encoded. This can be a list of string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see details in `encode_plus`). """ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, verbose=verbose, **kwargs, ) return self._batch_encode_plus( batch_text_or_text_pairs=batch_text_or_text_pairs, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, verbose=verbose, **kwargs, ) def _batch_encode_plus( self, batch_text_or_text_pairs, add_special_tokens=True, padding_strategy=PaddingStrategy.DO_NOT_PAD, truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs): def get_input_ids(text): if isinstance(text, str): tokens = self.tokenize(text, **kwargs) return self.convert_tokens_to_ids(tokens) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): if is_split_into_words: tokens = list( itertools.chain(*(self.tokenize( t, is_split_into_words=True, **kwargs) for t in text))) return self.convert_tokens_to_ids(tokens) else: return self.convert_tokens_to_ids(text) elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], int): return text else: raise ValueError( "Input is not valid. 
Should be a string, a list/tuple of strings or a list/tuple of integers." ) if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " "To use this feature, change your tokenizer to one deriving from " "transformers.PreTrainedTokenizerFast.") input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: if not isinstance(ids_or_pair_ids, (list, tuple)): ids, pair_ids = ids_or_pair_ids, None elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): ids, pair_ids = ids_or_pair_ids, None else: ids, pair_ids = ids_or_pair_ids first_ids = get_input_ids(ids) second_ids = get_input_ids( pair_ids) if pair_ids is not None else None input_ids.append((first_ids, second_ids)) batch_outputs = self._batch_prepare_for_model( input_ids, add_special_tokens=add_special_tokens, padding_strategy=padding_strategy, truncation_strategy=truncation_strategy, max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, return_length=return_length, return_tensors=return_tensors, verbose=verbose, ) return BatchEncoding(batch_outputs) def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens, using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Takes care of added tokens. Args: text (`str`): The sequence to be encoded. **kwargs (additional keyword arguments): Passed along to the model-specific `prepare_for_tokenization` preprocessing method. Returns: `List[str]`: The list of tokens. """ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors all_special_tokens_extended = dict( (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)) text, kwargs = self.prepare_for_tokenization(text, **kwargs) if kwargs: logger.warning(f"Keyword arguments {kwargs} not recognized.") # TODO: should this be in the base class? 
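        # Rough sketch of what follows (hypothetical special token "<sep>"
        # registered as a no-split token): self.tokens_trie.split("Hello <sep> world")
        # would return ["Hello ", "<sep>", " world"]; the "<sep>" chunk is kept
        # verbatim while the surrounding chunks are passed to self._tokenize()
        # further below, with neighbouring whitespace stripped around the
        # special token by default.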
if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) # ["This is something", "", " else"] for i, token in enumerate(tokens): if token in no_split_token: tok_extended = all_special_tokens_extended.get(token, None) left = tokens[i - 1] if i > 0 else None right = tokens[i + 1] if i < len(tokens) - 1 else None if isinstance(tok_extended, AddedToken): if tok_extended.rstrip and right: # A bit counter-intuitive but we strip the left of the string # since tok_extended.rstrip means the special token is eating all white spaces on its right tokens[i + 1] = right.lstrip() # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here else: # We strip left and right by default if right: tokens[i + 1] = right.lstrip() if left: tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: # Need to skip eventual empty (fully stripped) tokens if not token: continue if token in no_split_token: tokenized_text.append(token) else: tokenized_text.extend(self._tokenize(token)) # ["This", " is", " something", "", "else"] return tokenized_text def _tokenize(self, text): """Take as input a string and return a list of strings (tokens) for words/sub-words""" return self.sp_model.encode(text, out_type=str) def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): """ Performs any necessary transformations before tokenization. This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the `kwargs` at the end of the encoding process to be sure all the arguments have been used. Args: text (`str`): The text to prepare. is_split_into_words (`bool`, *optional*, defaults to `False`): Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification. kwargs: Keyword arguments to use for the tokenization. Returns: `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. """ return (text, kwargs) def convert_tokens_to_ids(self, tokens): """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. Args: tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. 
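
        Example (illustrative only; the tokens and ids below are hypothetical):

        ```python
        tokenizer.convert_tokens_to_ids("▁Hello")
        # 8774
        tokenizer.convert_tokens_to_ids(["▁Hello", "▁world"])
        # [8774, 296]
        ```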
""" if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put this inside your training loop. Args: pair (`bool`, *optional*, defaults to `False`): Whether the number of added tokens should be computed in the case of a sequence pair or a single sequence. Returns: `int`: Number of special tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] return len( self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format: - single sequence: `X ` - pair of sequences: `A B ` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
""" token_ids_0 = self._add_eos_if_not_present(token_ids_0) if token_ids_1 is None: return token_ids_0 else: token_ids_1 = self._add_eos_if_not_present(token_ids_1) return token_ids_0 + token_ids_1 @staticmethod def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length): if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes: deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[ pretrained_model_name_or_path] if init_max_model_length is not None and init_max_model_length != max_model_length: return init_max_model_length elif init_max_model_length is None: warnings.warn( "This tokenizer was incorrectly instantiated with a model max length of" f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this" " behavior is kept to avoid breaking backwards compatibility when padding/encoding with" " `truncation is True`.\n- Be aware that you SHOULD NOT rely on" f" {pretrained_model_name_or_path} automatically truncating your input to" f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences" f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with" " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please" " instantiate this tokenizer with `model_max_length` set to your preferred value.", FutureWarning, ) return max_model_length @property def vocab_size(self): return self.sp_model.get_piece_size() + self._extra_ids def get_vocab(self): vocab = { self.convert_ids_to_tokens(i): i for i in range(self.vocab_size) } vocab.update(self.added_tokens_encoder) return vocab def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. already_has_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not the token list is already formatted with special tokens for the model. Returns: `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) # normal case: some special tokens if token_ids_1 is None: return ([0] * len(token_ids_0)) + [1] return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] def _add_eos_if_not_present(self, token_ids): """Do not add eos again if user already added it.""" if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: warnings.warn( f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" " eos tokens being added.") return token_ids else: return token_ids + [self.eos_token_id] def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (`List[int]`): List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of zeros. 
""" eos = [self.eos_token_id] if token_ids_1 is None: return len(token_ids_0 + eos) * [0] return len(token_ids_0 + eos + token_ids_1 + eos) * [0] def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. A sequence has the following format: - single sequence: `X ` - pair of sequences: `A B ` Args: token_ids_0 (`List[int]`): List of IDs to which the special tokens will be added. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. Returns: `List[int]` of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ token_ids_0 = self._add_eos_if_not_present(token_ids_0) if token_ids_1 is None: return token_ids_0 else: token_ids_1 = self._add_eos_if_not_present(token_ids_1) return token_ids_0 + token_ids_1 def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None return state def __setstate__(self, d): self.__dict__ = d # for backward compatibility if not hasattr(self, "sp_model_kwargs"): self.sp_model_kwargs = {} self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def _tokenize(self, text: str): """Take as input a string and return a list of strings (tokens) for words/sub-words""" return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: token = f"" return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" current_sub_tokens = [] out_string = "" for token in tokens: # make sure that special tokens are not decoded using sentencepiece model if token in self.all_special_tokens: out_string += self.sp_model.decode_pieces( current_sub_tokens) + token + " " current_sub_tokens = [] else: current_sub_tokens.append(token) out_string += self.sp_model.decode_pieces(current_sub_tokens) return out_string.strip() def save_vocabulary(self, save_directory, filename_prefix=None): if not os.path.isdir(save_directory): logger.error( f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath( out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) return (out_vocab_file, ) @classmethod def _dict_from_json_file(cls, json_file): with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() return json.loads(text) ================================================ FILE: ppfleetx/data/tokenizers/tokenization_utils_base.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Inc. team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary of output with special method for the Fast tokenizers) """ import copy import json import os import re import warnings from collections import OrderedDict, UserDict from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union import importlib import numpy as np def is_sentencepiece_available(): return importlib.util.find_spec("sentencepiece") is not None def is_tokenizers_available(): return importlib.util.find_spec("tokenizers") is not None if is_tokenizers_available(): from tokenizers import AddedToken else: @dataclass(frozen=True, eq=True) class AddedToken: """ AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the way it should behave. """ content: str = field(default_factory=str) single_word: bool = False lstrip: bool = False rstrip: bool = False normalized: bool = True def __getstate__(self): return self.__dict__ TOKENIZER_MAPPING_NAMES = OrderedDict([ ( "albert", ( "AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None, ), ), ("bart", ("BartTokenizer", "BartTokenizerFast")), ( "barthez", ( "BarthezTokenizer" if is_sentencepiece_available() else None, "BarthezTokenizerFast" if is_tokenizers_available() else None, ), ), ("bartpho", ("BartphoTokenizer", None)), ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), ("bert-japanese", ("BertJapaneseTokenizer", None)), ("bertweet", ("BertweetTokenizer", None)), ( "big_bird", ( "BigBirdTokenizer" if is_sentencepiece_available() else None, "BigBirdTokenizerFast" if is_tokenizers_available() else None, ), ), ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)), ("byt5", ("ByT5Tokenizer", None)), ( "camembert", ( "CamembertTokenizer" if is_sentencepiece_available() else None, "CamembertTokenizerFast" if is_tokenizers_available() else None, ), ), ("canine", ("CanineTokenizer", None)), ( "clip", ( "CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None, ), ), ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), ( "cpm", ( "CpmTokenizer" if is_sentencepiece_available() else None, "CpmTokenizerFast" if is_tokenizers_available() else None, ), ), ("ctrl", 
("CTRLTokenizer", None)), ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), ( "deberta-v2", ( "DebertaV2Tokenizer" if is_sentencepiece_available() else None, "DebertaV2TokenizerFast" if is_tokenizers_available() else None, ), ), ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), ( "dpr", ( "DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, ), ), ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), ("flaubert", ("FlaubertTokenizer", None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), ("hubert", ("Wav2Vec2CTCTokenizer", None)), ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( "longt5", ( "T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None, ), ), ("luke", ("LukeTokenizer", None)), ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), ( "mbart", ( "MBartTokenizer" if is_sentencepiece_available() else None, "MBartTokenizerFast" if is_tokenizers_available() else None, ), ), ( "mbart50", ( "MBart50Tokenizer" if is_sentencepiece_available() else None, "MBart50TokenizerFast" if is_tokenizers_available() else None, ), ), ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)), ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), ( "mt5", ( "MT5Tokenizer" if is_sentencepiece_available() else None, "MT5TokenizerFast" if is_tokenizers_available() else None, ), ), ( "nystromformer", ( "AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None, ), ), 
("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)), ("opt", ("GPT2Tokenizer", None)), ( "pegasus", ( "PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None, ), ), ( "perceiver", ( "PerceiverTokenizer", None, ), ), ("phobert", ("PhobertTokenizer", None)), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("rag", ("RagTokenizer", None)), ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)), ( "reformer", ( "ReformerTokenizer" if is_sentencepiece_available() else None, "ReformerTokenizerFast" if is_tokenizers_available() else None, ), ), ( "rembert", ( "RemBertTokenizer" if is_sentencepiece_available() else None, "RemBertTokenizerFast" if is_tokenizers_available() else None, ), ), ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), ( "squeezebert", ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), ), ( "t5", ( "T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None, ), ), ("tapas", ("TapasTokenizer", None)), ("tapex", ("TapexTokenizer", None)), ("transfo-xl", ("TransfoXLTokenizer", None)), ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), ( "xglm", ( "XGLMTokenizer" if is_sentencepiece_available() else None, "XGLMTokenizerFast" if is_tokenizers_available() else None, ), ), ("xlm", ("XLMTokenizer", None)), ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), ( "xlm-roberta", ( "XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, ), ), ("xlm-roberta-xl", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ( "xlnet", ( "XLNetTokenizer" if is_sentencepiece_available() else None, "XLNetTokenizerFast" if is_tokenizers_available() else None, ), ), ( "yoso", ( "AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None, ), ), ]) SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([ ("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec"), ("data2vec-vision", "data2vec"), ]) def model_type_to_module_name(key): """Converts a config key to the corresponding module.""" # Special treatment if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] return key.replace("-", "_") class _LazyConfigMapping(OrderedDict): """ A dictionary that lazily load its values when 
they are requested. """ def __init__(self, mapping): self._mapping = mapping self._extra_content = {} self._modules = {} def __getitem__(self, key): if key in self._extra_content: return self._extra_content[key] if key not in self._mapping: raise KeyError(key) value = self._mapping[key] module_name = model_type_to_module_name(key) if module_name not in self._modules: self._modules[module_name] = importlib.import_module( f".{module_name}", "transformers.models") if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the # object at the top level. transformers_module = importlib.import_module("transformers") return getattr(transformers_module, value) def keys(self): return list(self._mapping.keys()) + list(self._extra_content.keys()) def values(self): return [self[k] for k in self._mapping.keys()] + list( self._extra_content.values()) def items(self): return [(k, self[k]) for k in self._mapping.keys()] + list( self._extra_content.items()) def __iter__(self): return iter( list(self._mapping.keys()) + list(self._extra_content.keys())) def __contains__(self, item): return item in self._mapping or item in self._extra_content def register(self, key, value): """ Register a new configuration in this mapping. """ if key in self._mapping.keys(): raise ValueError( f"'{key}' is already used by a Transformers config, pick another name." ) self._extra_content[key] = value class Trie: """ Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass Loose reference https://en.wikipedia.org/wiki/Trie """ def __init__(self): self.data = {} def add(self, word: str): """ Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. The special key `""` is used to represent termination. This function is idempotent, adding twice the same word will leave the trie unchanged Example: ```python >>> trie = Trie() >>> trie.add("Hello 友達") >>> trie.data {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} >>> trie.add("Hello") >>> trie.data {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} ``` """ if not word: # Prevent empty string return ref = self.data for char in word: ref[char] = char in ref and ref[char] or {} ref = ref[char] ref[""] = 1 def split(self, text: str) -> List[str]: """ Will look for the words added to the trie within `text`. Output is the original string splitted along the boundaries of the words found. This trie will match the longest possible word first ! Example: ```python >>> trie = Trie() >>> trie.split("[CLS] This is a extra_id_100") ["[CLS] This is a extra_id_100"] >>> trie.add("[CLS]") >>> trie.add("extra_id_1") >>> trie.add("extra_id_100") >>> trie.split("[CLS] This is a extra_id_100") ["[CLS]", " This is a ", "extra_id_100"] ``` """ # indexes are counted left of the chars index. # "hello", index 0, is left of h, index 1 is between h and e. # index 5 is right of the "o". # States are going to capture every possible start (indexes as above) # as keys, and have as values, a pointer to the position in the trie # where we're at. This is a partial match for now. # This enables to keep track of multiple matches while we're iterating # the string # If the trie contains, "blowing", and "lower" and we encounter the # string "blower", we need to split into ["b", "lower"]. 
# This is where we need to keep track of multiple possible starts. states = OrderedDict() # This will contain every indices where we need # to cut. # We force to cut at offset 0 and len(text) (added later) offsets = [0] # This is used by the lookahead which needs to skip over # some text where the full match exceeded the place in the initial # for loop skip = 0 # Main loop, Giving this algorithm O(n) complexity for current, current_char in enumerate(text): if skip and current < skip: # Prevents the lookahead for matching twice # like extra_id_100 and id_100 continue # This will track every state # that stop matching, we need to stop tracking them. # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then # fail on "b", we need to remove 0 from the valid states. to_remove = set() # Whenever we found a match, we need to drop everything # this is a greedy algorithm, it will match on the first found token reset = False # In this case, we already have partial matches (But unfinished) for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. # Lookahead to match longest first # Important in case of extra_id_1 vs extra_id_100 # Here we are also actively looking for other earlier partial # matches # "[CLS]", "L", we need to match CLS even if L is special for lookstart, looktrie_pointer in states.items(): if lookstart > start: # This partial match is later, we can stop looking break elif lookstart < start: # This partial match is earlier, the trie pointer # was already updated, so index is + 1 lookahead_index = current + 1 end = current + 1 else: # Here lookstart == start and # looktrie_pointer == trie_pointer # It wasn't updated yet so indices are current ones lookahead_index = current end = current next_char = text[ lookahead_index] if lookahead_index < len( text) else None if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index while next_char in looktrie_pointer: looktrie_pointer = looktrie_pointer[next_char] lookahead_index += 1 if "" in looktrie_pointer: start = lookstart end = lookahead_index skip = lookahead_index if lookahead_index == len(text): # End of string break next_char = text[lookahead_index] # End lookahead # Storing and resetting offsets.append(start) offsets.append(end) reset = True break elif current_char in trie_pointer: # The current character being looked at has a match within the trie # update the pointer (it will be stored back into states later). trie_pointer = trie_pointer[current_char] # Storing back the new pointer into the states. # Partial matches got longer by one. states[start] = trie_pointer else: # The new character has not match in the trie, we need # to stop keeping track of this partial match. # We can't do it directly within the loop because of how # python iteration works to_remove.add(start) # Either clearing the full start (we found a real match) # Or clearing only the partial matches that didn't work. if reset: states = {} else: for start in to_remove: del states[start] # If this character is a starting character within the trie # start keeping track of this partial match. if current >= skip and current_char in self.data: states[current] = self.data[current_char] # We have a cut at the end with states. for start, trie_pointer in states.items(): if "" in trie_pointer: # This is a final match, we need to reset and # store the results in `offsets`. 
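# --- Illustrative sketch (added commentary, not part of the original file) ---
# End-to-end behaviour of the matching loop above, mirroring the example in
# the split() docstring: registered tokens are never cut apart and the
# longest candidate wins ("extra_id_100" beats its prefix "extra_id_1").
def _demo_trie_split():
    trie = Trie()
    trie.add("[CLS]")
    trie.add("extra_id_1")
    trie.add("extra_id_100")
    parts = trie.split("[CLS] This is a extra_id_100")
    assert parts == ["[CLS]", " This is a ", "extra_id_100"]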
end = len(text) offsets.append(start) offsets.append(end) # Longest cut is always the one with lower start so the first # item so we need to break. break return self.cut_text(text, offsets) def cut_text(self, text, offsets): # We have all the offsets now, we just need to do the actual splitting. # We need to eventually add the first part of the string and the eventual # last part. offsets.append(len(text)) tokens = [] start = 0 for end in offsets: if start > end: logger.error( "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" " anyway.") continue elif start == end: # This might happen if there's a match at index 0 # we're also preventing zero-width cuts in case of two # consecutive matches continue tokens.append(text[start:end]) start = end return tokens from enum import Enum class ExplicitEnum(Enum): """ Enum with more explicit error message for missing values. """ @classmethod def _missing_(cls, value): raise ValueError( f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" ) class TensorType(ExplicitEnum): """ Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ PADDLE = "paddle" PYTORCH = "pt" TENSORFLOW = "tf" NUMPY = "np" JAX = "jax" class BatchEncoding(UserDict): """ Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`], [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc). This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes utility methods to map from word/character space to token space. Args: data (`dict`): Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods ('input_ids', 'attention_mask', etc.). encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*): If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this information. tensor_type (`Union[None, str, TensorType]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. prepend_batch_axis (`bool`, *optional*, defaults to `False`): Whether or not to add a batch axis when converting to tensors (see `tensor_type` above). n_sequences (`Optional[int]`, *optional*): You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at initialization. """ def __init__( self, data=None, encoding=None, tensor_type=None, prepend_batch_axis: bool=False, n_sequences=None, ): super().__init__(data) #if isinstance(encoding, EncodingFast): # encoding = [encoding] self._encodings = encoding if n_sequences is None and encoding is not None and len(encoding): n_sequences = encoding[0].n_sequences self._n_sequences = n_sequences self.convert_to_tensors( tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) @property def n_sequences(self) -> Optional[int]: """ `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this [`BatchEncoding`]. 
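# --- Illustrative sketch (added commentary, not part of the original file) ---
# What ExplicitEnum buys over a plain Enum: an invalid `return_tensors` value
# fails with a message that lists the accepted choices instead of the terse
# default error.
def _demo_tensor_type_error():
    assert TensorType("np") is TensorType.NUMPY
    raised = False
    try:
        TensorType("torch")  # not a member; "pt" is the PyTorch value
    except ValueError as err:
        raised = "not a valid TensorType" in str(err)
    assert raised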
Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of sentences) """ return self._n_sequences @property def is_fast(self) -> bool: """ `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`] or not. """ return self._encodings is not None # def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: def __getitem__(self, item): """ If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.). If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`. """ if isinstance(item, str): return self.data[item] elif self._encodings is not None: return self._encodings[item] else: raise KeyError( "Indexing with integers (to access backend Encoding for a given batch index) " "is not available when using Python based tokenizers") def __getattr__(self, item: str): try: return self.data[item] except KeyError: raise AttributeError def __getstate__(self): return {"data": self.data, "encodings": self._encodings} def __setstate__(self, state): if "data" in state: self.data = state["data"] if "encodings" in state: self._encodings = state["encodings"] def keys(self): return self.data.keys() def values(self): return self.data.values() def items(self): return self.data.items() # After this point: # Extended properties and methods only available for fast (Rust-based) tokenizers # provided by HuggingFace tokenizers library. @property def encodings(self): """ `Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if the input was tokenized through Python (i.e., not a fast) tokenizer. """ return self._encodings def tokens(self, batch_index=0): """ Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to integer indices) at a given batch index (only works for the output of a fast tokenizer). Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[str]`: The list of tokens at that index. """ if not self._encodings: raise ValueError( "tokens() is not available when using Python-based tokenizers") return self._encodings[batch_index].tokens def sequence_ids(self, batch_index=0): """ Return a list mapping the tokens to the id of their original sentences: - `None` for special tokens added around or between sequences, - `0` for tokens corresponding to words in the first sequence, - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly encoded. Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding sequence. """ if not self._encodings: raise ValueError( "sequence_ids() is not available when using Python-based tokenizers" ) return self._encodings[batch_index].sequence_ids def words(self, batch_index=0): """ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[Optional[int]]`: A list indicating the word corresponding to each token. 
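# --- Illustrative sketch (added commentary, not part of the original file) ---
# For a slow (pure-Python) tokenizer there are no backend `tokenizers.Encoding`
# objects, so a BatchEncoding is just a dict with attribute access; integer
# indexing and helpers such as tokens() are the fast-tokenizer extras above.
# The ids below are made-up placeholders.
def _demo_batch_encoding_access():
    enc = BatchEncoding({"input_ids": [[101, 2023, 102]],
                         "attention_mask": [[1, 1, 1]]})
    assert enc["input_ids"] == [[101, 2023, 102]]   # key access
    assert enc.attention_mask == [[1, 1, 1]]        # attribute access (__getattr__)
    assert enc.is_fast is False                     # no backend encodings attached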
Special tokens added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word (several tokens will be mapped to the same word index if they are parts of that word). """ if not self._encodings: raise ValueError( "words() is not available when using Python-based tokenizers") warnings.warn( "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " "but more self-explanatory `BatchEncoding.word_ids()` property.", FutureWarning, ) return self.word_ids(batch_index) def word_ids(self, batch_index: int=0) -> List[Optional[int]]: """ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. Args: batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. Returns: `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word (several tokens will be mapped to the same word index if they are parts of that word). """ if not self._encodings: raise ValueError( "word_ids() is not available when using Python-based tokenizers" ) return self._encodings[batch_index].word_ids def token_to_sequence(self, batch_or_token_index, token_index): """ Get the index of the sequence represented by the given token. In the general use case, this method returns `0` for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair Can be called as: - `self.token_to_sequence(token_index)` if batch size is 1 - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_token_index (`int`): Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of the token in the sequence. token_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the sequence. Returns: `int`: Index of the word in the input sequence. """ if not self._encodings: raise ValueError( "token_to_sequence() is not available when using Python based tokenizers" ) if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index if batch_index < 0: batch_index = self._batch_size + batch_index if token_index < 0: token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_sequence(token_index) def token_to_word(self, batch_or_token_index, token_index=None): """ Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. Can be called as: - `self.token_to_word(token_index)` if batch size is 1 - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_token_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence. 
token_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the sequence. Returns: `int`: Index of the word in the input sequence. """ if not self._encodings: raise ValueError( "token_to_word() is not available when using Python based tokenizers" ) if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index if batch_index < 0: batch_index = self._batch_size + batch_index if token_index < 0: token_index = self._seq_len + token_index return self._encodings[batch_index].token_to_word(token_index) def word_to_tokens(self, batch_or_word_index, word_index=None, sequence_index=0): """ Get the encoded token span corresponding to a word in a sequence of the batch. Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with: - **start** -- Index of the first token. - **end** -- Index of the token following the last token. Can be called as: - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1 - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_word_index (`int`): Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of the word in the sequence. word_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the sequence. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided word index belongs to. Returns: Optional [`~tokenization_utils_base.TokenSpan`] Span of tokens in the encoded sequence. Returns `None` if no tokens correspond to the word. """ if not self._encodings: raise ValueError( "word_to_tokens() is not available when using Python based tokenizers" ) if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index if batch_index < 0: batch_index = self._batch_size + batch_index if word_index < 0: word_index = self._seq_len + word_index span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) return TokenSpan(*span) if span is not None else None def token_to_chars(self, batch_or_token_index: int, token_index=None): """ Get the character span corresponding to an encoded token in a sequence of the batch. Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with: - **start** -- Index of the first character in the original string associated to the token. - **end** -- Index of the character following the last character in the original string associated to the token. Can be called as: - `self.token_to_chars(token_index)` if batch size is 1 - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1 Args: batch_or_token_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the token in the sequence. token_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in the sequence. 
Returns: [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token (e.g. , ) doesn't correspond to any chars in the origin string. """ if not self._encodings: raise ValueError( "token_to_chars() is not available when using Python based tokenizers" ) if token_index is not None: batch_index = batch_or_token_index else: batch_index = 0 token_index = batch_or_token_index span_indices = self._encodings[batch_index].token_to_chars(token_index) return CharSpan(*span_indices) if span_indices is not None else None def char_to_token(self, batch_or_char_index: int, char_index: Optional[int]=None, sequence_index: int=0) -> int: """ Get the index of the token in the encoded output comprising a character in the original string for a sequence of the batch. Can be called as: - `self.char_to_token(char_index)` if batch size is 1 - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence char_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the sequence. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided character index belongs to. Returns: `int`: Index of the token. """ if not self._encodings: raise ValueError( "char_to_token() is not available when using Python based tokenizers" ) if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_token(char_index, sequence_index) def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int]=None, sequence_index: int=0): """ Get the character span in the original string corresponding to given word in a sequence of the batch. Character spans are returned as a CharSpan NamedTuple with: - start: index of the first character in the original string - end: index of the character following the last character in the original string Can be called as: - `self.word_to_chars(word_index)` if batch size is 1 - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1 Args: batch_or_word_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the word in the sequence word_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the sequence. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided word index belongs to. Returns: `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. 
CharSpan are NamedTuple with: - start: index of the first character associated to the token in the original string - end: index of the character following the last character associated to the token in the original string """ if not self._encodings: raise ValueError( "word_to_chars() is not available when using Python based tokenizers" ) if word_index is not None: batch_index = batch_or_word_index else: batch_index = 0 word_index = batch_or_word_index return CharSpan(*(self._encodings[batch_index].word_to_chars( word_index, sequence_index))) def char_to_word(self, batch_or_char_index: int, char_index: Optional[int]=None, sequence_index: int=0) -> int: """ Get the word in the original string corresponding to a character in the original string of a sequence of the batch. Can be called as: - `self.char_to_word(char_index)` if batch size is 1 - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1 This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized words. Args: batch_or_char_index (`int`): Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of the character in the original string. char_index (`int`, *optional*): If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the original string. sequence_index (`int`, *optional*, defaults to 0): If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 or 1) the provided character index belongs to. Returns: `int` or `List[int]`: Index or indices of the associated encoded token(s). """ if not self._encodings: raise ValueError( "char_to_word() is not available when using Python based tokenizers" ) if char_index is not None: batch_index = batch_or_char_index else: batch_index = 0 char_index = batch_or_char_index return self._encodings[batch_index].char_to_word(char_index, sequence_index) def convert_to_tensors(self, tensor_type=None, prepend_batch_axis: bool=False): """ Convert the inner content to tensors. Args: tensor_type (`str` or [`~utils.TensorType`], *optional*): The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If `None`, no modification is done. prepend_batch_axis (`int`, *optional*, defaults to `False`): Whether or not to add the batch dimension during the conversion. """ if tensor_type is None: return self # Get a function reference for the correct framework if tensor_type == 'paddle': import paddle as_tensor = paddle.to_tensor is_tensor = paddle.is_tensor else: as_tensor = np.asarray is_tensor = _is_numpy # (mfuntowicz: This code is unreachable) # else: # raise ImportError( # f"Unable to convert output to tensors format {tensor_type}" # ) # Do the tensor conversion in batch for key, value in self.items(): try: if prepend_batch_axis: value = [value] if not is_tensor(value): tensor = as_tensor(value) # Removing this for now in favor of controlling the shape with `prepend_batch_axis` # # at-least2d # if tensor.ndim > 2: # tensor = tensor.squeeze(0) # elif tensor.ndim < 2: # tensor = tensor[None, :] self[key] = tensor except: # noqa E722 if key == "overflowing_tokens": raise ValueError( "Unable to create tensor returning overflowing tokens of different lengths. " "Please see if a fast version of this tokenizer is available to have this feature available." 
) raise ValueError( "Unable to create tensor, you should probably activate truncation and/or padding " "with 'padding=True' 'truncation=True' to have batched tensors with the same length." ) return self class TruncationStrategy(ExplicitEnum): """ Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ ONLY_FIRST = "only_first" ONLY_SECOND = "only_second" LONGEST_FIRST = "longest_first" DO_NOT_TRUNCATE = "do_not_truncate" class PaddingStrategy(ExplicitEnum): """ Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an IDE. """ LONGEST = "longest" MAX_LENGTH = "max_length" DO_NOT_PAD = "do_not_pad" class SpecialTokensMixin: """ A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be used to directly access these special tokens in a model-independent manner and allow to set and update the special tokens. Args: bos_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing the beginning of a sentence. eos_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing the end of a sentence. unk_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing an out-of-vocabulary token. sep_token (`str` or `tokenizers.AddedToken`, *optional*): A special token separating two different sentences in the same input (used by BERT for instance). pad_token (`str` or `tokenizers.AddedToken`, *optional*): A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by attention mechanisms or loss computation. cls_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing the class of the input (used by BERT for instance). mask_token (`str` or `tokenizers.AddedToken`, *optional*): A special token representing a masked token (used by masked-language modeling pretraining objectives, like BERT). additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): A tuple or a list of additional special tokens. """ SPECIAL_TOKENS_ATTRIBUTES = [ "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token", "additional_special_tokens", ] def __init__(self, verbose=True, **kwargs): self._bos_token = None self._eos_token = None self._unk_token = None self._sep_token = None self._pad_token = None self._cls_token = None self._mask_token = None self._pad_token_type_id = 0 self._additional_special_tokens = [] self.verbose = verbose self.added_tokens_encoder: Dict[str, int] = {} self.added_tokens_decoder: Dict[int, str] = {} self.unique_no_split_tokens: List[str] = [] self.tokens_trie = Trie() self._decode_use_source_tokenizer = False # We directly set the hidden value to allow initialization with special tokens # which are not yet in the vocabulary. 
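# --- Illustrative sketch (added commentary, not part of the original file) ---
# Why the ValueError above points at padding/truncation: tensors have to be
# rectangular, so every sample must reach the same length before
# convert_to_tensors() can stack them.  PaddingStrategy / TruncationStrategy
# above enumerate the accepted string values for those arguments.  A minimal
# sketch, assuming Paddle is installed (the ids are made up):
def _demo_convert_to_paddle_tensors():
    enc = BatchEncoding({"input_ids": [[1, 2, 3], [4, 5, 0]]},   # already padded
                        tensor_type=TensorType.PADDLE.value)
    assert list(enc["input_ids"].shape) == [2, 3]
    # a ragged batch such as [[1, 2, 3], [4]] would instead raise the
    # ValueError above, suggesting padding=True / truncation=True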
Necessary for serialization/de-serialization # TODO clean this up at some point (probably by switching to fast tokenizers) for key, value in kwargs.items(): if value is None: continue if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key == "additional_special_tokens": assert isinstance(value, ( list, tuple)), f"Value {value} is not a list or tuple" assert all( isinstance(t, (str, AddedToken)) for t in value ), "One of the tokens is not a string or an AddedToken" setattr(self, key, value) elif isinstance(value, (str, AddedToken)): setattr(self, key, value) else: raise TypeError( f"special token {key} has to be either str or AddedToken but got: {type(value)}" ) def convert_tokens_to_ids( self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: """ Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary. Args: tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. """ if tokens is None: return None if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) ids = [] for token in tokens: ids.append(self._convert_token_to_id_with_added_voc(token)) return ids def _convert_token_to_id_with_added_voc(self, token): if token is None: return None if token in self.added_tokens_encoder: return self.added_tokens_encoder[token] return self._convert_token_to_id(token) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" if token.startswith("", token) num = int(match.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def sanitize_special_tokens(self) -> int: """ Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the vocabulary. Add the missing ones to the vocabulary if needed. Return: `int`: The number of tokens added in the vocabulary during the operation. """ return self.add_tokens( self.all_special_tokens_extended, special_tokens=True) def add_special_tokens( self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: """ Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the model so that its embedding matrix matches the tokenizer. In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. Using `add_special_tokens` will ensure your special tokens can be used in several ways: - Special tokens are carefully handled by the tokenizer (they are never split). - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. When possible, special tokens are already registered for provided pretrained models (for instance [`BertTokenizer`] `cls_token` is already registered to be :obj*'[CLS]'* and XLM's one is also registered to be `''`). Args: special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`): Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`]. 
Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the `unk_token` to them). Returns: `int`: Number of tokens added to the vocabulary. Examples: ```python # Let's see how to add a new classification token to GPT-2 tokenizer = GPT2Tokenizer.from_pretrained("gpt2") model = GPT2Model.from_pretrained("gpt2") special_tokens_dict = {"cls_token": ""} num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print("We have added", num_added_toks, "tokens") # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. model.resize_token_embeddings(len(tokenizer)) assert tokenizer.cls_token == "" ```""" if not special_tokens_dict: return 0 added_tokens = 0 for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" if self.verbose: #logger.info(f"Assigning {value} to the {key} key of the tokenizer") print(f"Assigning {value} to the {key} key of the tokenizer") setattr(self, key, value) if key == "additional_special_tokens": assert isinstance(value, (list, tuple)) and all( isinstance(t, (str, AddedToken)) for t in value ), f"Tokens {value} for key {key} should all be str or AddedToken instances" added_tokens += self.add_tokens(value, special_tokens=True) else: assert isinstance( value, (str, AddedToken) ), f"Token {value} for key {key} should be a str or an AddedToken instance" added_tokens += self.add_tokens([value], special_tokens=True) return added_tokens def add_tokens( self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool=False) -> int: """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to it with indices starting from length of the current vocabulary. Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the model so that its embedding matrix matches the tokenizer. In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. Args: new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`): Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string token to let you personalize its behavior: whether this token should only match against a single word, whether this token should strip all potential whitespaces on the left side, whether this token should strip all potential whitespaces on the right side, etc. special_tokens (`bool`, *optional*, defaults to `False`): Can be used to specify if the token is a special token. This mostly change the normalization behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance). See details for `tokenizers.AddedToken` in HuggingFace tokenizers library. Returns: `int`: Number of tokens added to the vocabulary. Examples: ```python # Let's see how to increase the vocabulary of Bert model and tokenizer tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") model = BertModel.from_pretrained("bert-base-uncased") num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) print("We have added", num_added_toks, "tokens") # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. 
model.resize_token_embeddings(len(tokenizer)) ```""" if not new_tokens: return 0 if not isinstance(new_tokens, (list, tuple)): new_tokens = [new_tokens] return self._add_tokens(new_tokens, special_tokens=special_tokens) def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool=False) -> int: new_tokens = [str(tok) for tok in new_tokens] tokens_to_add = [] for token in new_tokens: if not isinstance(token, str): raise TypeError( f"Token {token} is not a string but a {type(token)}.") if not special_tokens and hasattr( self, "do_lower_case") and self.do_lower_case: token = token.lower() if (token != self.unk_token and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and token not in tokens_to_add): tokens_to_add.append(token) #if self.verbose: #logger.info(f"Adding {token} to the vocabulary") #print(f"Adding {token} to the vocabulary") added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.added_tokens_decoder.update(added_tok_decoder) # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) if special_tokens: if len(new_tokens) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union(set(new_tokens))) else: # Or on the newly added tokens if len(tokens_to_add) == 1: _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) else: self.unique_no_split_tokens = sorted( set(self.unique_no_split_tokens).union( set(tokens_to_add))) self._create_trie(self.unique_no_split_tokens) return len(tokens_to_add) def _create_trie(self, unique_no_split_tokens): trie = Trie() for token in unique_no_split_tokens: if hasattr( self, "do_lower_case" ) and self.do_lower_case and token not in self.all_special_tokens: trie.add(token.lower()) else: trie.add(token) self.tokens_trie = trie @property def bos_token(self) -> str: """ `str`: Beginning of sentence token. Log an error if used while not having been set. """ if self._bos_token is None and self.verbose: print("Using bos_token, but it is not set yet.") #logger.error("Using bos_token, but it is not set yet.") return None return str(self._bos_token) @property def eos_token(self) -> str: """ `str`: End of sentence token. Log an error if used while not having been set. """ if self._eos_token is None and self.verbose: #logger.error("Using eos_token, but it is not set yet.") print("Using eos_token, but it is not set yet.") return None return str(self._eos_token) @property def unk_token(self) -> str: """ `str`: Unknown token. Log an error if used while not having been set. """ if self._unk_token is None and self.verbose: print("Using unk_token, but it is not set yet.") #logger.error("Using unk_token, but it is not set yet.") return None return str(self._unk_token) @property def sep_token(self) -> str: """ `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not having been set. """ if self._sep_token is None and self.verbose: print("Using sep_token, but it is not set yet.") #logger.error("Using sep_token, but it is not set yet.") return None return str(self._sep_token) @property def pad_token(self) -> str: """ `str`: Padding token. Log an error if used while not having been set. 
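# --- Illustrative sketch (added commentary, not part of the original file) ---
# How _add_tokens() above assigns ids: genuinely new tokens are appended
# after the existing vocabulary, so the first one gets id len(tokenizer), the
# next len(tokenizer) + 1, and so on.  The size and token names below are
# hypothetical; this is also why the docstrings insist on calling
# model.resize_token_embeddings(len(tokenizer)) afterwards.
def _demo_added_token_ids():
    current_size = 50257                      # pretend len(tokenizer)
    tokens_to_add = ["<ent>", "<rel>"]        # pretend new tokens
    added_tok_encoder = {tok: current_size + i
                         for i, tok in enumerate(tokens_to_add)}
    assert added_tok_encoder == {"<ent>": 50257, "<rel>": 50258}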
""" if self._pad_token is None and self.verbose: #logger.error("Using pad_token, but it is not set yet.") print("Using pad_token, but it is not set yet.") return None return str(self._pad_token) @property def cls_token(self) -> str: """ `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ if self._cls_token is None and self.verbose: #logger.error("Using cls_token, but it is not set yet.") print("Using cls_token, but it is not set yet.") return None return str(self._cls_token) @property def mask_token(self) -> str: """ `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set. """ if self._mask_token is None and self.verbose: #logger.error("Using mask_token, but it is not set yet.") print("Using mask_token, but it is not set yet.") return None return str(self._mask_token) @property def additional_special_tokens(self) -> List[str]: """ `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been set. """ if self._additional_special_tokens is None and self.verbose: #logger.error("Using additional_special_tokens, but it is not set yet.") print("Using additional_special_tokens, but it is not set yet.") return None return [str(tok) for tok in self._additional_special_tokens] @bos_token.setter def bos_token(self, value): self._bos_token = value @eos_token.setter def eos_token(self, value): self._eos_token = value @unk_token.setter def unk_token(self, value): self._unk_token = value @sep_token.setter def sep_token(self, value): self._sep_token = value @pad_token.setter def pad_token(self, value): self._pad_token = value @cls_token.setter def cls_token(self, value): self._cls_token = value @mask_token.setter def mask_token(self, value): self._mask_token = value @additional_special_tokens.setter def additional_special_tokens(self, value): self._additional_special_tokens = value @property def bos_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not been set. """ if self._bos_token is None: return None return self.convert_tokens_to_ids(self.bos_token) @property def eos_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been set. """ if self._eos_token is None: return None return self.convert_tokens_to_ids(self.eos_token) @property def unk_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set. """ if self._unk_token is None: return None return self.convert_tokens_to_ids(self.unk_token) @property def sep_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input sequence. Returns `None` if the token has not been set. """ if self._sep_token is None: return None return self.convert_tokens_to_ids(self.sep_token) @property def pad_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set. """ if self._pad_token is None: return None return self.convert_tokens_to_ids(self.pad_token) @property def pad_token_type_id(self) -> int: """ `int`: Id of the padding token type in the vocabulary. 
""" return self._pad_token_type_id @property def cls_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Returns `None` if the token has not been set. """ if self._cls_token is None: return None return self.convert_tokens_to_ids(self.cls_token) @property def mask_token_id(self) -> Optional[int]: """ `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language modeling. Returns `None` if the token has not been set. """ if self._mask_token is None: return None return self.convert_tokens_to_ids(self.mask_token) @property def additional_special_tokens_ids(self) -> List[int]: """ `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.additional_special_tokens) @bos_token_id.setter def bos_token_id(self, value): self._bos_token = self.convert_ids_to_tokens( value) if value is not None else None @eos_token_id.setter def eos_token_id(self, value): self._eos_token = self.convert_ids_to_tokens( value) if value is not None else None @unk_token_id.setter def unk_token_id(self, value): self._unk_token = self.convert_ids_to_tokens( value) if value is not None else None @sep_token_id.setter def sep_token_id(self, value): self._sep_token = self.convert_ids_to_tokens( value) if value is not None else None @pad_token_id.setter def pad_token_id(self, value): self._pad_token = self.convert_ids_to_tokens( value) if value is not None else None @cls_token_id.setter def cls_token_id(self, value): self._cls_token = self.convert_ids_to_tokens( value) if value is not None else None @mask_token_id.setter def mask_token_id(self, value): self._mask_token = self.convert_ids_to_tokens( value) if value is not None else None @additional_special_tokens_ids.setter def additional_special_tokens_ids(self, values): self._additional_special_tokens = [ self.convert_ids_to_tokens(value) for value in values ] @property def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: """ `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.). Convert potential tokens of `tokenizers.AddedToken` type to string. """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: attr_value = getattr(self, "_" + attr) if attr_value: set_attr[attr] = (type(attr_value)( str(attr_value_sub) for attr_value_sub in attr_value) if isinstance(attr_value, (list, tuple)) else str(attr_value)) return set_attr @property def special_tokens_map_extended(self) -> Dict[str, Union[ str, AddedToken, List[Union[str, AddedToken]]]]: """ `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.). Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how special tokens are tokenized. """ set_attr = {} for attr in self.SPECIAL_TOKENS_ATTRIBUTES: attr_value = getattr(self, "_" + attr) if attr_value: set_attr[attr] = attr_value return set_attr @property def all_special_tokens(self) -> List[str]: """ `List[str]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. Convert tokens of `tokenizers.AddedToken` type to string. 
""" all_toks = [str(s) for s in self.all_special_tokens_extended] return all_toks @property def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: """ `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how special tokens are tokenized. """ all_toks = [] set_attr = self.special_tokens_map_extended for attr_value in set_attr.values(): all_toks = all_toks + (list(attr_value) if isinstance(attr_value, ( list, tuple)) else [attr_value]) all_toks = list(OrderedDict.fromkeys(all_toks)) return all_toks @property def all_special_ids(self) -> List[int]: """ `List[int]`: List the ids of the special tokens(`''`, `''`, etc.) mapped to class attributes. """ all_toks = self.all_special_tokens all_ids = self.convert_tokens_to_ids(all_toks) return all_ids ================================================ FILE: ppfleetx/data/transforms/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/data/transforms/preprocess.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals from functools import partial import math import random import cv2 import numpy as np from PIL import Image from PIL import ImageFilter from paddle.vision.transforms import functional as F from paddle.vision.transforms import ColorJitter as PPColorJitter from paddle.vision.transforms import Grayscale from ppfleetx.utils.log import logger class OperatorParamError(ValueError): """ OperatorParamError """ pass class DecodeImage(object): """ decode image """ def __init__(self, to_rgb=True, channel_first=False): self.to_rgb = to_rgb self.channel_first = channel_first def __call__(self, img): assert type(img) is bytes and len( img) > 0, "invalid input 'img' in DecodeImage" data = np.frombuffer(img, dtype='uint8') img = cv2.imdecode(data, 1) if self.to_rgb: assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( img.shape) img = img[:, :, ::-1] if self.channel_first: img = img.transpose((2, 0, 1)) return img class UnifiedResize(object): def __init__(self, interpolation=None, backend="cv2"): _cv2_interp_from_str = { 'nearest': cv2.INTER_NEAREST, 'bilinear': cv2.INTER_LINEAR, 'area': cv2.INTER_AREA, 'bicubic': cv2.INTER_CUBIC, 'lanczos': cv2.INTER_LANCZOS4 } _pil_interp_from_str = { 'nearest': Image.NEAREST, 'bilinear': Image.BILINEAR, 'bicubic': Image.BICUBIC, 'box': Image.BOX, 'lanczos': Image.LANCZOS, 'hamming': Image.HAMMING } def _pil_resize(src, size, resample): pil_img = Image.fromarray(src) pil_img = pil_img.resize(size, resample) return np.asarray(pil_img) if backend.lower() == "cv2": if isinstance(interpolation, str): interpolation = _cv2_interp_from_str[interpolation.lower()] # compatible with opencv < version 4.4.0 elif interpolation is None: interpolation = cv2.INTER_LINEAR self.resize_func = partial(cv2.resize, interpolation=interpolation) elif backend.lower() == "pil": if isinstance(interpolation, str): interpolation = _pil_interp_from_str[interpolation.lower()] self.resize_func = partial(_pil_resize, resample=interpolation) else: logger.warning( f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." 
) self.resize_func = cv2.resize def __call__(self, src, size): return self.resize_func(src, size) class ResizeImage(object): """ resize image """ def __init__(self, size=None, resize_short=None, interpolation=None, backend="cv2"): if resize_short is not None and resize_short > 0: self.resize_short = resize_short self.w = None self.h = None elif size is not None: self.resize_short = None self.w = size if type(size) is int else size[0] self.h = size if type(size) is int else size[1] else: raise OperatorParamError("invalid params for ReisizeImage for '\ 'both 'size' and 'resize_short' are None") self._resize_func = UnifiedResize( interpolation=interpolation, backend=backend) def __call__(self, img): img_h, img_w = img.shape[:2] if self.resize_short is not None: percent = float(self.resize_short) / min(img_w, img_h) w = int(round(img_w * percent)) h = int(round(img_h * percent)) else: w = self.w h = self.h return self._resize_func(img, (w, h)) class CenterCropImage(object): """ crop image """ def __init__(self, size): if type(size) is int: self.size = (size, size) else: self.size = size # (h, w) def __call__(self, img): w, h = self.size img_h, img_w = img.shape[:2] w_start = (img_w - w) // 2 h_start = (img_h - h) // 2 w_end = w_start + w h_end = h_start + h return img[h_start:h_end, w_start:w_end, :] class RandCropImage(object): """ random crop image """ def __init__(self, size, scale=None, ratio=None, interpolation=None, backend="cv2"): if type(size) is int: self.size = (size, size) # (h, w) else: self.size = size self.scale = [0.08, 1.0] if scale is None else scale self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio self._resize_func = UnifiedResize( interpolation=interpolation, backend=backend) def __call__(self, img): size = self.size scale = self.scale ratio = self.ratio aspect_ratio = math.sqrt(random.uniform(*ratio)) w = 1. * aspect_ratio h = 1. / aspect_ratio img_h, img_w = img.shape[:2] bound = min((float(img_w) / img_h) / (w**2), (float(img_h) / img_w) / (h**2)) scale_max = min(scale[1], bound) scale_min = min(scale[0], bound) target_area = img_w * img_h * random.uniform(scale_min, scale_max) target_size = math.sqrt(target_area) w = int(target_size * w) h = int(target_size * h) i = random.randint(0, img_w - w) j = random.randint(0, img_h - h) img = img[j:j + h, i:i + w, :] return self._resize_func(img, size) class RandFlipImage(object): """ random flip image flip_code: 1: Flipped Horizontally 0: Flipped Vertically -1: Flipped Horizontally & Vertically """ def __init__(self, flip_code=1): assert flip_code in [-1, 0, 1 ], "flip_code should be a value in [-1, 0, 1]" self.flip_code = flip_code def __call__(self, img): if random.randint(0, 1) == 1: return cv2.flip(img, self.flip_code) else: return img class NormalizeImage(object): """ normalize image such as substract mean, divide std """ def __init__(self, scale=None, mean=None, std=None, order='chw', output_fp16=False, channel_num=3): if isinstance(scale, str): scale = eval(scale) assert channel_num in [ 3, 4 ], "channel number of input image should be set to 3 or 4." 
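# --- Illustrative sketch (added commentary, not part of the original file) ---
# Typical evaluation-style use of the operators above: decode raw JPEG bytes,
# resize the short side to 256, then take a 224x224 center crop.  The image
# here is synthetic; cv2 and np are already imported at the top of this file.
def _demo_resize_and_center_crop():
    fake = (np.random.rand(300, 400, 3) * 255).astype("uint8")
    ok, buf = cv2.imencode(".jpg", fake)            # produce JPEG bytes to decode
    assert ok
    img = DecodeImage(to_rgb=True)(buf.tobytes())   # HWC uint8, RGB
    img = ResizeImage(resize_short=256)(img)        # short side -> 256, keep aspect
    img = CenterCropImage(size=224)(img)            # 224 x 224 center crop
    assert img.shape[:2] == (224, 224)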
self.channel_num = channel_num self.output_dtype = 'float16' if output_fp16 else 'float32' self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) self.order = order mean = mean if mean is not None else [0.485, 0.456, 0.406] std = std if std is not None else [0.229, 0.224, 0.225] shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3) self.mean = np.array(mean).reshape(shape).astype('float32') self.std = np.array(std).reshape(shape).astype('float32') def __call__(self, img): if isinstance(img, Image.Image): img = np.array(img) assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage" img = (img.astype('float32') * self.scale - self.mean) / self.std if self.channel_num == 4: img_h = img.shape[1] if self.order == 'chw' else img.shape[0] img_w = img.shape[2] if self.order == 'chw' else img.shape[1] pad_zeros = np.zeros( (1, img_h, img_w)) if self.order == 'chw' else np.zeros( (img_h, img_w, 1)) img = (np.concatenate( (img, pad_zeros), axis=0) if self.order == 'chw' else np.concatenate( (img, pad_zeros), axis=2)) return img.astype(self.output_dtype) class ToCHWImage(object): """ convert hwc image to chw image """ def __init__(self): pass def __call__(self, img): if isinstance(img, Image.Image): img = np.array(img) return img.transpose((2, 0, 1)) class ColorJitter(PPColorJitter): """ColorJitter. """ def __init__(self, *args, **kwargs): self.p = kwargs.pop('p', 1.0) super().__init__(*args, **kwargs) def __call__(self, img): if random.random() < self.p: if not isinstance(img, Image.Image): img = np.ascontiguousarray(img) img = Image.fromarray(img) img = super()._apply_image(img) if isinstance(img, Image.Image): img = np.asarray(img) return img class GaussianBlur(object): """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" def __init__(self, sigma=[.1, 2.], p=1.0): self.p = p self.sigma = sigma def __call__(self, img): if random.random() < self.p: if not isinstance(img, Image.Image): img = np.ascontiguousarray(img) img = Image.fromarray(img) sigma = random.uniform(self.sigma[0], self.sigma[1]) img = img.filter(ImageFilter.GaussianBlur(radius=sigma)) if isinstance(img, Image.Image): img = np.asarray(img) return img class Pixels(object): def __init__(self, mode="const", mean=[0., 0., 0.]): self._mode = mode self._mean = mean def __call__(self, h=224, w=224, c=3): if self._mode == "rand": return np.random.normal(size=(1, 1, 3)) elif self._mode == "pixel": return np.random.normal(size=(h, w, c)) elif self._mode == "const": return self._mean else: raise Exception( "Invalid mode in RandomErasing, only support \"const\", \"rand\", \"pixel\"" ) class RandomErasing(object): """RandomErasing. This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm. 
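# --- Illustrative sketch (added commentary, not part of the original file) ---
# NormalizeImage above computes (img * scale - mean) / std with ImageNet
# defaults (scale=1/255, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]);
# with order="hwc" the stats broadcast over the last axis, and ToCHWImage then
# moves channels first for Paddle's conv layers.
def _demo_normalize_and_to_chw():
    img = np.full((224, 224, 3), 128, dtype="uint8")     # flat mid-grey image
    out = NormalizeImage(order="hwc")(img)
    # red channel: (128/255 - 0.485) / 0.229
    assert abs(out[0, 0, 0] - (128 / 255.0 - 0.485) / 0.229) < 1e-4
    chw = ToCHWImage()(out)
    assert chw.shape == (3, 224, 224)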
""" def __init__(self, EPSILON=0.5, sl=0.02, sh=0.4, r1=0.3, mean=[0., 0., 0.], attempt=100, use_log_aspect=False, mode='const'): self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON self.sl = eval(sl) if isinstance(sl, str) else sl self.sh = eval(sh) if isinstance(sh, str) else sh r1 = eval(r1) if isinstance(r1, str) else r1 self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else ( r1, 1 / r1) self.use_log_aspect = use_log_aspect self.attempt = attempt self.get_pixels = Pixels(mode, mean) def __call__(self, img): if random.random() > self.EPSILON: return img for _ in range(self.attempt): area = img.shape[0] * img.shape[1] target_area = random.uniform(self.sl, self.sh) * area aspect_ratio = random.uniform(*self.r1) if self.use_log_aspect: aspect_ratio = math.exp(aspect_ratio) h = int(round(math.sqrt(target_area * aspect_ratio))) w = int(round(math.sqrt(target_area / aspect_ratio))) if w < img.shape[1] and h < img.shape[0]: pixels = self.get_pixels(h, w, img.shape[2]) x1 = random.randint(0, img.shape[0] - h) y1 = random.randint(0, img.shape[1] - w) if img.shape[2] == 3: img[x1:x1 + h, y1:y1 + w, :] = pixels else: img[x1:x1 + h, y1:y1 + w, 0] = pixels[0] return img return img class RandomGrayscale(object): """Randomly convert image to grayscale with a probability of p (default 0.1). Args: p (float): probability that image should be converted to grayscale. Returns: PIL Image: Grayscale version of the input image with probability p and unchanged with probability (1-p). - If input image is 1 channel: grayscale version is 1 channel - If input image is 3 channel: grayscale version is 3 channel with r == g == b """ def __init__(self, p=0.1): self.p = p def __call__(self, img): """ Args: img (PIL Image): Image to be converted to grayscale. Returns: PIL Image: Randomly grayscaled image. """ flag = False if not isinstance(img, Image.Image): img = np.ascontiguousarray(img) img = Image.fromarray(img) flag = True num_output_channels = 1 if img.mode == 'L' else 3 if random.random() < self.p: img = F.to_grayscale(img, num_output_channels=num_output_channels) if flag: img = np.asarray(img) return img ================================================ FILE: ppfleetx/data/transforms/utils.py ================================================ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . 
import preprocess def transform(data, ops=[]): """ transform """ for op in ops: data = op(data) return data def create_preprocess_operators(params): """ create operators based on the config Args: params(list): a dict list, used to create some operators """ assert isinstance(params, list), ('operator config should be a list') ops = [] for operator in params: assert isinstance(operator, dict) and len(operator) == 1, "yaml format error" op_name = list(operator)[0] param = {} if operator[op_name] is None else operator[op_name] op = getattr(preprocess, op_name)(**param) ops.append(op) return ops ================================================ FILE: ppfleetx/data/utils/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .batch_collate_fn import * ================================================ FILE: ppfleetx/data/utils/batch_collate_fn.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import os import sys import numbers import numpy as np from dataclasses import dataclass try: from collections.abc import Sequence, Mapping except: from collections import Sequence, Mapping from ppfleetx.data.sampler import Stack, Tuple def collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`, get input data as a list of sample datas, each element in list if the data of a sample, and sample data should composed of list, dictionary, string, number, numpy array and paddle.Tensor, this function will parse input data recursively and stack number, numpy array and paddle.Tensor datas as batch datas. e.g. for following input data: [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, {'image': np.array(shape=[3, 224, 224]), 'label': 3}, {'image': np.array(shape=[3, 224, 224]), 'label': 4}, {'image': np.array(shape=[3, 224, 224]), 'label': 5},] This default collate function zipped each number and numpy array field together and stack each field as the batch field as follows: {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} Args: batch(list of sample data): batch should be a list of sample data. Returns: Batched data: batched each number, numpy array and paddle.Tensor in input data. 
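# Illustrative sketch (editorial addition): usage of ``collate_fn`` mirroring
# the docstring above, where a list of dict samples is collated field by field.
import numpy as np

batch = [
    {'image': np.zeros([3, 224, 224], dtype='float32'), 'label': 1},
    {'image': np.ones([3, 224, 224], dtype='float32'), 'label': 3},
]
collated = collate_fn(batch)
# collated['image'].shape == (2, 3, 224, 224)
# collated['label'] == np.array([1, 3])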
""" sample = batch[0] if isinstance(sample, np.ndarray): batch = np.stack(batch, axis=0) return batch elif isinstance(sample, paddle.Tensor): return paddle.stack(batch, axis=0) elif isinstance(sample, numbers.Number): batch = np.array(batch) return batch elif isinstance(sample, (str, bytes)): return batch elif isinstance(sample, Mapping): return {key: collate_fn([d[key] for d in batch]) for key in sample} elif isinstance(sample, Sequence): sample_fields_num = len(sample) if not all(len(sample) == sample_fields_num for sample in iter(batch)): raise RuntimeError( "fileds number not same among samples in a batch") return [collate_fn(fields) for fields in zip(*batch)] raise TypeError("batch data con only contains: tensor, numpy.ndarray, " "dict, list, number, but got {}".format(type(sample))) def default_collate_fn(batch_transform=None): if batch_transform is not None: # batch_ops = create_preprocess_operators(batch_transform) # def inner_collate_fn(batch): # batch = transform(batch, batch_ops) # batch = collate_fn(batch) # return batch # return inner_collate_fn pass else: return collate_fn def gpt_collate_fn(batch): return Tuple([Stack() for raw in zip(*batch)])(batch) class ErnieCollateData(): def __init__(self, micro_batch_size=1): self.micro_batch_size = micro_batch_size def generate_data(self, data, stack_fn=Stack()): num_fields = len(data[0]) out = [None] * num_fields # 0. input_ids, # 1. segment_ids, # 2. input_mask, # 3. masked_lm_positions, # 4. masked_lm_labels, # 5. next_sentence_labels for i in (0, 1, 2, 5): out[i] = stack_fn([x[i] for x in data]) out[5] = out[5].reshape([-1, 1]) batch_size, seq_length = out[0].shape size = num_mask = sum(len(x[3]) for x in data) # masked_lm_positions # Organize as a 1D tensor for gather or use gather_nd if size % 8 != 0: size += 8 - (size % 8) out[3] = np.full(size, 0, dtype=np.int32) # masked_lm_labels out[4] = np.full([size, 1], -1, dtype=np.int64) mask_token_num = 0 for i, x in enumerate(data): for j, pos in enumerate(x[3]): out[3][mask_token_num] = i * seq_length + pos out[4][mask_token_num] = x[4][j] mask_token_num += 1 return out def __call__(self, data): accumulate_steps = len(data) // self.micro_batch_size if accumulate_steps == 1: return self.generate_data(data) else: self.micro_batch_size = len(data) // accumulate_steps all_data = [[] for _ in range(6)] for acc_step in range(accumulate_steps): tmp = self.generate_data( data[acc_step * self.micro_batch_size:(acc_step + 1) * self.micro_batch_size]) for i in range(6): all_data[i].append(tmp[i]) return all_data @dataclass class DataCollatorWithPadding: """ Data collator that will dynamically pad the inputs to the longest sequence in the batch. Args: tokenizer_type (str): The type of tokenizer used for encoding the data. 
""" def __init__(self, tokenizer_type, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors="pd", return_attention_mask=None): from ppfleetx.data.tokenizers import get_ernie_tokenizer self.tokenizer = get_ernie_tokenizer(tokenizer_type) self.padding = padding self.max_length = max_length self.pad_to_multiple_of = pad_to_multiple_of self.return_tensors = return_tensors self.return_attention_mask = return_attention_mask def __call__(self, features): batch = self.tokenizer.pad( features, padding=self.padding, max_length=self.max_length, pad_to_multiple_of=self.pad_to_multiple_of, return_tensors=self.return_tensors, return_attention_mask=self.return_attention_mask) if "label" in batch: batch["labels"] = batch["label"] del batch["label"] if "label_ids" in batch: batch["labels"] = batch["label_ids"] del batch["label_ids"] return batch def imagen_collate_fn(samples): """ collate for imagen base64 """ tmp = [] for i in samples: if i and len(i['image']): tmp.append(i) samples = tmp if len(samples) == 0: return None pad_idx = 0 text_items = [sample['caption'] for sample in samples] image_items = [sample['image'] for sample in samples] text_lengths = [len(cap) for cap in text_items] bsz = len(text_items) text_input = text_items image_input = paddle.stack(image_items, axis=0) _input = {'images': image_input, 'texts': text_input} return _input ================================================ FILE: ppfleetx/distributed/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/distributed/apis/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/distributed/apis/amp.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from collections import defaultdict from types import MethodType import numpy as np import paddle import paddle.nn as nn from paddle import _legacy_C_ops from paddle.fluid.dygraph import to_variable from paddle.fluid import framework from paddle.fluid.dygraph import base as imperative_base from paddle.framework import core from ppfleetx.distributed.apis import env class MixPrecisionLayer(nn.Layer): def __init__(self, layers, dtype="float16"): super().__init__(layers.full_name() + "_mix_precision") self._layers = layers self._dtype = dtype assert self._dtype in ["float16", "bfloat16"] for param in self._layers.parameters(): if not param.stop_gradient and not hasattr(param, "main_grad"): setattr(param, "main_grad", None) param._register_grad_hook(self._update_main_grad_hook(param)) def _update_main_grad_hook(self, param): """Create the update_main_grad hook for backprop.""" # Hook used for back-prop and grad-merge. @paddle.autograd.no_grad() def param_hook(tmp_grad): assert param.grad is None, \ "In main_grad node, param.grad should be None, but find param[{}] has grad.".format(param.name) if param.main_grad is None: param.main_grad = core.eager.Tensor( value=tmp_grad.cast(paddle.float32).value(), place=tmp_grad.place, name="main_grad@" + param.name, ) else: param.main_grad.add_(tmp_grad.cast(paddle.float32)) tmp_grad._clear_data() return None return param_hook def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) return outputs def state_dict( self, destination=None, include_sublayers=True, structured_name_prefix="", ): return self._layers.state_dict( destination=destination, include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix, ) @framework.deprecate_stat_dict def set_state_dict(self, state_dict, use_structured_name=True): self._layers.set_state_dict( state_dict, use_structured_name=use_structured_name) class MixPrecisionOptimizer: def __init__(self, optimizer): self._inner_opt = optimizer self._parameter_list = self._obtain_optimizer_parameters_list() def _obtain_optimizer_parameters_list(self): if getattr(self._inner_opt, '_param_groups', None) and isinstance( self._inner_opt._param_groups[0], dict): parameters_list = [] for group in self._inner_opt._param_groups: for param in group['params']: parameters_list.append(param) else: parameters_list = [ param for param in self._inner_opt._parameter_list ] return parameters_list @imperative_base.no_grad @framework.dygraph_only def step(self): if not isinstance(self._parameter_list[0], dict): params_grads = [] for param in self._parameter_list: if param.stop_gradient: continue grad_var = param.main_grad if framework.in_dygraph_mode(): if (hasattr(grad_var, "is_selected_rows") and grad_var.is_selected_rows() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." ) else: if (hasattr(grad_var, "_is_sparse") and grad_var._is_sparse() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." 
) params_grads.append((param, grad_var)) optimize_ops = self._inner_opt._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) else: # optimize parameters in groups for param_group in self._inner_opt._param_groups: params_grads = defaultdict(lambda: list()) for param in param_group['params']: if param.stop_gradient: continue grad_var = param.main_grad if framework.in_dygraph_mode(): if (hasattr(grad_var, "is_selected_rows") and grad_var.is_selected_rows() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." ) else: if (hasattr(grad_var, "_is_sparse") and grad_var._is_sparse() and self._inner_opt.regularization is not None): raise RuntimeError( "AdamW don't support weight_decay with sparse parameters, please set it to None." ) params_grads['params'].append((param, grad_var)) params_grads.update( {k: v for k, v in param_group.items() if k != 'params'}) self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) @framework.dygraph_only def clear_grad(self, set_to_zero=True): param_list = [] if self._parameter_list is None or not isinstance( self._parameter_list[0], dict): for p in self._parameter_list: if not p.stop_gradient: param_list.append(p) else: for param_group in self._param_groups: for p in param_group['params']: if not p.stop_gradient: param_list.append(p) for p in param_list: if hasattr(p, "main_grad") and p.main_grad is not None: if set_to_zero: p.main_grad.zero_() else: p.main_grad._clear() p.main_grad = None elif not hasattr(p, "main_grad"): p.clear_gradient(set_to_zero) def __getattr__(self, item): return getattr(self._inner_opt, item) def unscale_method(self, optimizer): if not self._enable: return param_grads = [] if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): for group in optimizer._param_groups: for param in group['params']: if param.main_grad is not None: assert param.main_grad.dtype == core.VarDesc.VarType.FP32 param_grads.append(param.main_grad) else: for param in optimizer._parameter_list: if param.main_grad is not None: assert param.main_grad.dtype == core.VarDesc.VarType.FP32 param_grads.append(param.main_grad) temp_found_inf = to_variable(np.array([0]).astype(np.bool_)) if len(param_grads): _legacy_C_ops.check_finite_and_unscale( param_grads, self._scale, param_grads, temp_found_inf, ) self._found_inf = 1 if temp_found_inf else 0 hcg = env.get_hcg() if hcg is not None and hcg.nranks > hcg.get_data_parallel_world_size(): is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") paddle.distributed.all_reduce( is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) self._found_inf = is_found_inf.numpy()[0] class MixPrecisionScaler: def __init__(self, scaler): self._inner_scaler = scaler self._inner_scaler._unscale = MethodType(unscale_method, scaler) def __getattr__(self, item): return getattr(self._inner_scaler, item) ================================================ FILE: ppfleetx/distributed/apis/comm_groups.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
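# Illustrative sketch (editorial addition, refers to amp.py above): a hedged
# example of wiring the three mix-precision wrappers together. MixPrecisionLayer
# installs float32 ``main_grad`` buffers through gradient hooks, MixPrecisionOptimizer
# steps on those main grads, and MixPrecisionScaler swaps in the main-grad-aware
# unscale method. The model and optimizer here are placeholders.
import paddle

model = paddle.nn.Linear(8, 8)
optimizer = paddle.optimizer.AdamW(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=32768.0)

model = MixPrecisionLayer(model, dtype="float16")
optimizer = MixPrecisionOptimizer(optimizer)
scaler = MixPrecisionScaler(scaler)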
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.distributed as dist from paddle.distributed import fleet from paddle.distributed.fleet.base.strategy_group import ( StrategyGroupBase, DPGroup, MPGroup, PPGroup, ShardingGroup, ) from paddle.distributed.fleet.base.orthogonal_strategy import OrthogonalStrategy def create_hcg(strategy, hcg_name): if hcg_name == "HybridCommunicateGroup": fleet.init(is_collective=True, strategy=strategy) hcg = fleet.get_hybrid_communicate_group() else: dist.init_parallel_env() hcg = eval("{}".format(hcg_name))(strategy) return hcg class MoEGroup(StrategyGroupBase): """ The communication group strategy for expert parallel. Args: list_of_ranks: A 2D-array, such as `[[0, 1, 2, 3], [4, 5, 6, 7]]`. Ranks in sublist represents they are in the same communication group. Returns: The instance of expert parallel strategy group. """ def __init__(self, list_of_ranks): super(MoEGroup, self).__init__(list_of_ranks) assert not isinstance( self.group, list), "Rank {} belongs to multi moe groups".format(self._rank) class Hybrid4DCommGroup(OrthogonalStrategy): def __init__(self, list_of_strategy=None, fused_strategy_dict={}): list_of_strategy = [ ("dp", 1, DPGroup), ("mp", 1, MPGroup), ("pp", 1, PPGroup), ("sharding", 1, ShardingGroup), ] if list_of_strategy is None else list_of_strategy fused_strategy_dict["check"] = ["mp", "pp"] super().__init__(list_of_strategy, fused_strategy_dict) # data parallel def get_data_parallel_rank(self): return self.rank_in_strategy("dp") def get_data_parallel_world_size(self): return self.strategy_group("dp").world_size def get_data_parallel_group(self): return self.strategy_group("dp").group def get_data_parallel_group_src_rank(self): return self.strategy_group("dp").group.ranks[0] # tensor parallel def get_model_parallel_rank(self): return self.rank_in_strategy("mp") def get_model_parallel_world_size(self): return self.strategy_group("mp").world_size def get_model_parallel_group(self): return self.strategy_group("mp").group def get_model_parallel_group_src_rank(self): return self.strategy_group("mp").group.ranks[0] # pipeline parallel def get_stage_id(self): return self.rank_in_strategy("pp") def get_pipe_parallel_world_size(self): return self.strategy_group("pp").world_size def get_pipe_parallel_group(self): return self.strategy_group("pp").group def get_p2p_groups(self): return (self.strategy_group("pp").p2p_groups) # group sharded parallel def get_sharding_parallel_rank(self): return self.rank_in_strategy("sharding") def get_sharding_parallel_world_size(self): return self.strategy_group("sharding").world_size def get_sharding_parallel_group(self): return self.strategy_group("sharding") def get_sharding_parallel_group_src_rank(self): return self.strategy_group("sharding").ranks[0] # check parallel group def get_check_parallel_group(self): return self.strategy_group("check").group class HybridCommGroupForMoE(Hybrid4DCommGroup): def __init__(self, strategy): self._dp_degree = strategy.hybrid_configs.get("dp_degree", 1) self._mp_degree = strategy.hybrid_configs.get("mp_degree", 1) self._pp_degree = strategy.hybrid_configs.get("pp_degree", 1) self._sharding_degree = 
strategy.hybrid_configs.get("sharding_degree", 1) assert self._pp_degree == 1, "The strategy combination of moe and pp \ has not been supported in ppfleetx right now." assert self._sharding_degree == 1, "The strategy combination of moe and sharding \ has not been supported in ppfleetx right now." list_of_strategy = [ ("dp", self._dp_degree, DPGroup), ("mp", self._mp_degree, MPGroup), ("pp", self._pp_degree, PPGroup), ("sharding", self._sharding_degree, ShardingGroup), ] fused_strategy_dict = {"moe": ["dp", "mp"]} super().__init__(list_of_strategy, fused_strategy_dict) def get_expert_parallel_world_size(self): return self.fused_strategy_group("moe").world_size def get_expert_parallel_group(self): return self.fused_strategy_group("moe").group ================================================ FILE: ppfleetx/distributed/apis/env.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import random import numpy as np import paddle import paddle.distributed as dist from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import comm_groups __all__ = ['init_dist_env'] _seed = None _dp_seed = None _hcg = None def set_seed(seed): # NOTE(shenliang03): For parameter init seed: # seed: dp/mp_undistributed_paramter/sharding is same; others is different # For compute seed(dropout): # global seed: only mp group is same. 
# local seed: all groups are different if dist.get_world_size() > 1: # obtain rank message of hybrid parallel hcg = get_hcg() mp_rank = hcg.get_model_parallel_rank() mp_size = hcg.get_model_parallel_world_size() pp_rank = hcg.get_stage_id() pp_size = hcg.get_pipe_parallel_world_size() dp_rank = hcg.get_data_parallel_rank() dp_size = hcg.get_data_parallel_world_size() sharding_rank = hcg.get_sharding_parallel_rank() sharding_size = hcg.get_sharding_parallel_world_size() else: mp_rank, mp_size = 0, 1 pp_rank, pp_size = 0, 1 dp_rank, dp_size = 0, 1 sharding_rank, sharding_size = 0, 1 # NOTE: the commented seeds are set only for precision validation # seed += 100 * pp_rank random.seed(seed + 100 * pp_rank) np.random.seed(seed + 100 * pp_rank) # seed = mp_rank + # pp_rank * (mp_size) + # dp_rank * (mp_size * pp_size) + # sharding_rank * (mp_size * pp_size * dp_size) # seed offset is order to avoid conflicts with the parameter initialization seed seed_offset = seed + 1024 + paddle.distributed.get_world_size() global_seed = seed_offset + \ pp_rank * (mp_size) + \ dp_rank * (mp_size * pp_size) + \ sharding_rank * (mp_size * pp_size * dp_size) seed_offset += paddle.distributed.get_world_size() local_seed = seed_offset + \ mp_rank + \ pp_rank * (mp_size) + \ dp_rank * (mp_size * pp_size) + \ sharding_rank * (mp_size * pp_size * dp_size) tracker = get_rng_state_tracker() tracker.add('global_seed', global_seed) tracker.add('local_seed', local_seed) paddle.seed(global_seed) logger.info("The global seed is set to {} and local seed is set to {}.". format(global_seed, local_seed)) global _seed global _dp_seed _seed = seed _dp_seed = global_seed def set_hcg(hcg): global _hcg _hcg = hcg def get_hcg(): global _hcg return _hcg def get_seed(): global _seed return _seed def get_dp_seed(): global _dp_seed return _dp_seed def init_dist_env(config): paddle.set_device(config.Global.device) strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { "dp_degree": config.Distributed.dp_degree, "mp_degree": config.Distributed.mp_degree, "pp_degree": config.Distributed.pp_degree, "sharding_degree": config.Distributed.sharding.sharding_degree, } if config.Distributed.pp_degree > 1: if 'sequence_parallel' in config.Model: if config.Model.sequence_parallel: assert config.Global.enable_partial_send_recv is False, \ "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv should be set False." 
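# (Editorial note, not in the original file) ``accumulate_steps`` below is the
# gradient-accumulation factor, i.e. local_batch_size // micro_batch_size; for
# example local_batch_size=16 with micro_batch_size=2 schedules 8 micro-batches
# per optimizer step on each pipeline. The numbers are illustrative.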
strategy.pipeline_configs = { "accumulate_steps": config.Global.local_batch_size // config.Global.micro_batch_size, "micro_batch_size": config.Global.micro_batch_size, "enable_partial_send_recv": config.Global.enable_partial_send_recv, } # set control in tensor parallel seed = config.Global.seed strategy.tensor_parallel_configs = {"tensor_init_seed": seed} hcg = comm_groups.create_hcg(strategy, hcg_name=config.Distributed.hcg) set_hcg(hcg) def get_local_rank(): return int(os.getenv("PADDLE_RANK_IN_NODE", 0)) def get_data_world_size(): if paddle.distributed.get_world_size() == 1: return 1 hcg = get_hcg() dp_size = hcg.get_data_parallel_world_size() sharding_size = hcg.get_sharding_parallel_world_size() return dp_size * sharding_size def get_data_world_rank(): if paddle.distributed.get_world_size() == 1: return 0 hcg = get_hcg() dp_rank = hcg.get_data_parallel_rank() sharding_rank = hcg.get_sharding_parallel_rank() sharding_size = hcg.get_sharding_parallel_world_size() return dp_rank * sharding_size + sharding_rank def work_at_local_rank0(func): def wrapper(*args, **kwargs): local_rank = 0 if paddle.fluid.core.is_compiled_with_dist( ) and paddle.distributed.get_world_size() > 1: local_rank = paddle.distributed.ParallelEnv().dev_id if local_rank == 0: func(*args, **kwargs) return wrapper ================================================ FILE: ppfleetx/distributed/apis/io.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import random import numpy as np import paddle import paddle.distributed as dist from paddle.distributed import fleet from paddle.incubate.distributed.utils.io import save_for_auto_inference from ppfleetx.utils.log import logger from ppfleetx.distributed.apis import env def save(output_dir, model, optimizer=None, step=0, epoch=0, sharding_stage=2): """ save the state dicts of model and optimizer into an checkpoint. 
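# Illustrative sketch (editorial addition): the checkpoint layout produced by
# ``save`` below for epoch 1 / step 1000 on a multi-rank job with mp_rank=0,
# sharding_rank=0, pp_rank=0 (paths are illustrative):
#
#   output_dir/epoch_1_step_1000/mp_00_sharding_00_pp_00/model.pdparams
#   output_dir/epoch_1_step_1000/mp_00_sharding_00_pp_00/model_state.pdopt
#   output_dir/epoch_1_step_1000/mp_00_sharding_00_pp_00/meta_state.pdopt
#
# ``load`` rebuilds the same per-rank sub-directory to locate these files.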
""" nranks = dist.get_world_size() if nranks > 1: hcg = env.get_hcg() dp_rank = hcg.get_data_parallel_rank() mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() sharding_rank = hcg.get_sharding_parallel_rank() else: dp_rank = 0 if dp_rank != 0: logger.info("DP_Rank %d doesn't save model" % dp_rank) return if output_dir and isinstance(output_dir, str): output_dir = os.path.join(output_dir, "epoch_%d_step_%d" % (epoch, step)) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) logger.info("Save model to %s" % output_dir) save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( output_dir, mp_rank, sharding_rank, pp_rank) if nranks > 1 else output_dir if sharding_stage == 3: model.get_all_parameters(convert2cpu=False) paddle.save(model.state_dict(), os.path.join(save_dir, "model.pdparams")) if optimizer is not None: paddle.save(optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt")) meta_dict = { "epoch": epoch, "step": step, "cuda_rng_state": paddle.get_cuda_rng_state() } paddle.save(meta_dict, os.path.join(save_dir, "meta_state.pdopt")) save_auto_dir = os.path.join(output_dir, "auto_infer") save_for_auto_inference(os.path.join(save_auto_dir, "auto"), model) else: raise TypeError("`save` requires a valid value of `output_dir`.") def load(ckpt_dir, model, optimizer=None, mode='train', load_recovery=None): nranks = dist.get_world_size() if nranks > 1: hcg = env.get_hcg() dp_rank = hcg.get_data_parallel_rank() mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() sharding_rank = hcg.get_sharding_parallel_rank() else: dp_rank = 0 load_recovery = {} if load_recovery is None else load_recovery if ckpt_dir and isinstance(ckpt_dir, str): logger.info("Try to load checkpoint from %s " % ckpt_dir) if mode == 'quant': load_dir = ckpt_dir else: load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}".format( ckpt_dir, mp_rank, sharding_rank, pp_rank) if nranks > 1 else ckpt_dir model_path = os.path.join(load_dir, "model.pdparams") opt_path = os.path.join(load_dir, "model_state.pdopt") meta_path = os.path.join(load_dir, "meta_state.pdopt") if os.path.exists(model_path): model_dict = paddle.load(model_path) for name, param in model.state_dict().items(): assert name in model_dict.keys( ), "No param named `{}` was found in checkpoint file.".format( name) if param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) model.set_state_dict(model_dict) else: raise ValueError("No model checkpoint file found in %s." % model_path) if mode == 'train': if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) optimizer.set_state_dict(opt_dict) else: raise ValueError("No optimizer checkpoint file found in %s." % opt_path) if os.path.exists(meta_path): meta_dict = paddle.load(meta_path) load_recovery.update({ 'step': meta_dict['step'], 'epoch': meta_dict['epoch'], 'rng_state': meta_dict['cuda_rng_state'] }) else: raise ValueError("No meta checkpoint file found in %s." % meta_path) logger.info("successfully load checkpoints") else: logger.warning("`load` requires a valid value of `ckpt_dir`.") raise TypeError("`load` requires a valid value of `ckpt_dir`.") ================================================ FILE: ppfleetx/distributed/apis/strategy.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.distributed as dist import paddle.distributed.fleet as fleet from paddle.distributed.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.distributed.fleet.meta_parallel import TensorParallel from paddle.distributed.sharding import group_sharded_parallel from ppfleetx.distributed.apis import env, amp from ppfleetx.utils.tensor_fusion_helper import all_reduce_parameters def wrap_with_fleet(dist_config, model, optimizer=None, scaler=None): if dist_config.sharding.sharding_stage in [2, 3]: assert dist_config.pp_degree == 1, \ "sharding stage2/3 will support pipeline parallel later" return wrap_sharding_2_3(dist_config, model, optimizer, scaler) else: return wrap_3D_parallel(dist_config, model, optimizer, scaler) def wrap_sharding_2_3(dist_config, model, optimizer=None, scaler=None): hcg = env.get_hcg() dp_group = hcg.get_data_parallel_group() sharding_group = hcg.get_sharding_parallel_group() if dist_config.dp_degree > 1 and dist_config.sharding.sharding_stage == 3: sync_params_buffers( model, comm_group=dp_group, src_rank=dp_group.ranks[0]) if dist_config.mp_degree > 1: assert dist_config.sharding.sharding_stage == 2, "only support mp + sharding stage2 hybrid parallel now." model = TensorParallel(model, hcg, strategy=None) level = "p_g_os" if dist_config.sharding.sharding_stage == 3 else "os_g" origin_model = model model, optimizer, scaler = group_sharded_parallel( model=model, optimizer=optimizer, level=level, scaler=scaler, group=sharding_group, offload=dist_config.sharding.sharding_offload, dp_group=dp_group if dp_group.nranks > 1 else None) if dist_config.sharding.reduce_overlap: model._set_reduce_overlap(dist_config.sharding.reduce_overlap) if dist_config.sharding.broadcast_overlap: optimizer._set_broadcast_overlap( dist_config.sharding.broadcast_overlap, layers=origin_model, num_groups=2) return model, optimizer, scaler def wrap_3D_parallel(dist_config, model, optimizer=None, scaler=None): hcg = env.get_hcg() dp_group = hcg.get_data_parallel_group() if isinstance(model, amp.MixPrecisionLayer): if dist.get_world_size() == dist_config.dp_degree: sync_params_buffers( model, comm_group=dp_group, src_rank=dp_group.ranks[0]) elif dist_config.pp_degree > 1: model = fleet.distributed_model(model._layers) else: model = fleet.distributed_model(model) optimizer = fleet.distributed_optimizer( optimizer) if optimizer is not None else optimizer scaler = fleet.distributed_scaler(scaler) if scaler is not None else scaler return model, optimizer, scaler ================================================ FILE: ppfleetx/distributed/protein_folding/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
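# Illustrative sketch (editorial addition, refers to strategy.py above): hedged
# usage of ``wrap_with_fleet``. Depending on dist_config.sharding.sharding_stage,
# model/optimizer/scaler are wrapped either with group-sharded parallel
# (stage 2/3) or with fleet's 3D-parallel wrappers; ``dist_config`` stands for
# ``config.Distributed`` and is an assumption about the calling script.
model, optimizer, scaler = wrap_with_fleet(
    dist_config, model, optimizer=optimizer, scaler=scaler)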
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . scg import scg ================================================ FILE: ppfleetx/distributed/protein_folding/bp.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Branch Parallel helper function""" import paddle from paddle.autograd import PyLayer from . import scg __all__ = [ 'get_world_size', 'get_rank_in_group', ] def get_world_size(): nranks = 1 if hasattr(scg, "bp_group"): nranks = scg.bp_group.nranks return nranks def get_rank_in_group(): rank = 0 if hasattr(scg, "get_rank_in_bp_group"): rank = scg.get_rank_in_bp_group() return rank @paddle.no_grad() def broadcast(tensor, src): """ broadcast tensor from src rank in bp group """ if get_world_size() == 1: return tensor assert src in [0, 1], "Branch Parallel is only support bp_degree=2 now!" group = scg.bp_group task = group.process_group.broadcast(tensor, src) task.wait() return tensor class BroadcastGrad(PyLayer): """ A PyLayer Op broadcast gradient in backward stage """ @staticmethod def forward(ctx, input, src): """ return input directly """ ctx.src = src return input.clone() @staticmethod def backward(ctx, grad_output): """ broadcast grad form src """ broadcast(grad_output, ctx.src) return grad_output.clone() def broadcast_grad_for_backward(input, src): """ a warpper for boradcast gradient in backward stage """ if get_world_size() == 1: return input if not input.stop_gradient: output = BroadcastGrad.apply(input, src) else: output = input.clone() return output @paddle.no_grad() def all_reduce(tensor): """ allreduce a tensor in bp group """ if get_world_size() == 1: return tensor group = scg.bp_group paddle.distributed.all_reduce( tensor, sync_op=True, group=group) return tensor class SyncEvoformerResults(PyLayer): """ A PyLayer Op broadcast gradient in backward stage """ @staticmethod def forward(ctx, outer, msa, pair): broadcast(outer, 0) if get_rank_in_group() == 1: pair += outer broadcast(pair, 1) broadcast(msa, 0) return msa, pair @staticmethod def backward(ctx, *grad_output): msa_grad = grad_output[0] pair_grad = grad_output[1] if get_rank_in_group() == 0: pair_grad = paddle.zeros_like(pair_grad) outer_grad = pair_grad.clone() broadcast(outer_grad, 1) return outer_grad, msa_grad, pair_grad def sync_evoformer_results(outer, msa, pair): """ a warpper for boradcast gradient in backward stage """ if get_world_size() == 1: return msa, pair if outer.stop_gradient and msa.stop_gradient and pair.stop_gradient: return msa, pair msa, pair = SyncEvoformerResults.apply(outer, msa, pair) return msa, pair @paddle.no_grad() def 
grad_sync(param_groups): """ sync the gradients of params """ nranks = get_world_size() if nranks < 2: return comm_group = scg.bp_group for group in param_groups: if group.get("bp", False): for p in group['params']: if p.is_distributed: continue grad = p.grad if grad is None: continue paddle.distributed.all_reduce( grad, sync_op=True, group=comm_group) return None ================================================ FILE: ppfleetx/distributed/protein_folding/dap.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Dynamic Axial Parallelism and Duality Async Operation helper functions paper ref: FastFold: Reducing AlphaFold Training Time from 11 Days to 67 Hours, https://arxiv.org/abs/2203.00854 code ref: https://github.com/hpcaitech/FastFold.git """ import warnings import time import paddle from paddle import nn from paddle import distributed as dist from paddle.autograd import PyLayer from . import scg __all__ = [ 'set_dap_sync_op', 'get_dap_sync_op', 'get_world_size', 'get_rank_in_group', 'scatter', 'gather', 'all_gather', 'all_gather_opp', 'all_to_all', 'all_to_all_opp', 'row_to_col', 'col_to_row' ] _sync_op = True def set_dap_sync_op(sync_op): assert sync_op in [True, False] assert sync_op is True, "Only support sync mode now!" 
global _sync_op _sync_op = sync_op def get_dap_sync_op(): global _sync_op return _sync_op def get_world_size(): nranks = 1 if hasattr(scg, "dap_group"): nranks = scg.dap_group.nranks return nranks def get_rank_in_group(): rank = 0 if hasattr(scg, "get_rank_in_dap_group"): rank = scg.get_rank_in_dap_group() return rank def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, '{} is not divisible by {}'.format( numerator, denominator) def divide(numerator, denominator): ensure_divisibility(numerator, denominator) return numerator // denominator @paddle.no_grad() def _all_gather(tensor, axis=-1, sync_op=True): group = scg.dap_group tensor_shape = list(tensor.shape) tensor_shape[0] *= group.nranks out = paddle.zeros(tensor_shape, tensor.dtype) out.stop_gradient = tensor.stop_gradient task = group.process_group.all_gather(tensor, out) task.wait() return out @paddle.no_grad() def _gather(tensor, axis=-1): output = _all_gather(tensor) if axis != 0: output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=axis) return output @paddle.no_grad() def _split(tensor, axis=-1): ensure_divisibility(tensor.shape[axis], get_world_size()) tensor_list = paddle.split(tensor, get_world_size(), axis=axis) output = tensor_list[get_rank_in_group()] return output class Scatter(PyLayer): """ Scatter PyLayer Op""" @staticmethod def forward(ctx, input, axis: -1): ctx.axis = axis return _split(input, axis=axis) @staticmethod def backward(ctx, grad_output): return _gather(grad_output, axis=ctx.axis) def scatter(input, axis=-1): """ split a tensor according axis by dap size """ if get_world_size() == 1: return input if not input.stop_gradient: output = Scatter.apply(input, axis=axis) else: output = _split(input, axis=axis) return output class Gather(PyLayer): """ Gather PyLayer Op """ @staticmethod def forward(ctx, input, axis=-1): ctx.axis = axis return _gather(input, axis=axis) @staticmethod def backward(ctx, grad_output): return _split(grad_output, axis=ctx.axis) def gather(input, axis=-1): """ gather tensor form all rank in dap group in axis """ if get_world_size() == 1: return input if not input.stop_gradient: output = Gather.apply(input, axis=axis) else: output = _gather(input, axis=axis) return output @paddle.no_grad() def _reduce_scatter(tensor, sync_op=True): group = scg.dap_group tensor_shape = list(tensor.shape) tensor_shape[0] = divide(tensor_shape[0], group.nranks) output = paddle.zeros(tensor_shape, tensor.dtype) output.stop_gradient = tensor.stop_gradient dist.stream.reduce_scatter( output, tensor, op=dist.ReduceOp.SUM, group=group, sync_op=True) return output class AllGather(PyLayer): """ AllGather PyLayer Op """ @staticmethod def forward(ctx, input, axis=-1, sync_op=True): ctx.axis = axis ctx.sync_op = sync_op output = _all_gather(input, axis=axis, sync_op=sync_op) return output @staticmethod def backward(ctx, grad_output): if not ctx.sync_op: pass # TODO(GuoxiaWang): implement wait logical return grad_output class AllGather_Opp(PyLayer): """ Duality Async Operation for AllGather """ @staticmethod def forward(ctx, input, axis=-1, sync_op=True): ctx.axis = axis ctx.sync_op = sync_op return input @staticmethod def backward(ctx, grad_output): output = _reduce_scatter(grad_output, sync_op=ctx.sync_op) return output def all_gather(input, axis=-1): """ gather tensors from all rank in dap group and all get the result. if sync_op=None, sync will be assign according init_dap setting. 
when using async communication, sync_op=False, do not use the output as same as input. E.g. do not use `a = all_gather(a, ...)`, recommend to use `b = all_gather(a, ...)` """ if get_world_size() == 1: return input sync_op = get_dap_sync_op() if not input.stop_gradient: output = AllGather.apply(input, axis, sync_op=sync_op) else: output = _all_gather(input, axis, sync_op=sync_op) return output def all_gather_opp(output, axis=-1): """ Duality Async Operation for all_gather. if sync_op=None, sync will be assign according init_dap setting. """ nranks = get_world_size() if nranks == 1: return output sync_op = get_dap_sync_op() if not sync_op: # TODO(GuoxiaWang): implement wait logical pass if not output.stop_gradient: output = AllGather_Opp.apply(output, axis, sync_op=sync_op) if axis != 0: output = paddle.concat(paddle.split(output, nranks, 0), axis=axis) return output @paddle.no_grad() def _all_to_all(tensor, in_axis=-1, out_axis=-1, sync_op=True): group = scg.dap_group tensor_shape = list(tensor.shape) out = paddle.zeros(tensor_shape, tensor.dtype) out.stop_gradient = tensor.stop_gradient task = group.process_group.alltoall(tensor, out) task.wait() return out class All_to_All(PyLayer): """ All_to_All PyLayer Op""" @staticmethod def forward(ctx, input, in_axis=-1, out_axis=-1, sync_op=True): ctx.in_axis = in_axis ctx.out_axis = out_axis ctx.sync_op = sync_op return _all_to_all( input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) @staticmethod def backward(ctx, grad_output): if not ctx.sync_op: # TODO(GuoxiaWang): implement wait logical pass return grad_output class All_to_All_Opp(PyLayer): """ Duality Async Operation for All_to_All """ @staticmethod def forward(ctx, output, in_axis=-1, out_axis=-1, sync_op=True): ctx.in_axis = in_axis ctx.out_axis = out_axis ctx.sync_op = sync_op return output @staticmethod def backward(ctx, grad_output): return _all_to_all( grad_output, in_axis=ctx.out_axis, out_axis=ctx.in_axis, sync_op=ctx.sync_op) def all_to_all(input, in_axis, out_axis): """ all to all according in_axis and out_axis. if sync_op=None, sync will be assign according init_dap setting. """ if get_world_size() == 1: return input sync_op = get_dap_sync_op() if in_axis != 0: ensure_divisibility(input.shape[in_axis], get_world_size()) input = paddle.concat( paddle.split( input, get_world_size(), axis=in_axis), axis=0) if not input.stop_gradient: output = All_to_All.apply( input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) else: output = _all_to_all( input, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) return output def all_to_all_opp(output, in_axis, out_axis): """ Duality Async Operation for all_to_all. if sync_op=None, sync will be assign according init_dap setting. 
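# Illustrative sketch (editorial addition, refers to dap.py above): with a
# dap_group of size N, ``scatter`` keeps 1/N of a tensor along ``axis`` on each
# rank and ``gather`` is its inverse (their PyLayer variants swap roles in
# backward). A hedged round trip assuming a 2-rank dap_group is initialized:
import paddle

x = paddle.randn([4, 16, 64])   # e.g. (batch, seq, channels)
shard = scatter(x, axis=1)      # (4, 8, 64) on each of the 2 ranks
full = gather(shard, axis=1)    # (4, 16, 64) again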
""" if get_world_size() == 1: return output sync_op = get_dap_sync_op() if not sync_op: # TODO(GuoxiaWang): implement wait logical pass if not output.stop_gradient: output = All_to_All_Opp.apply( output, in_axis=in_axis, out_axis=out_axis, sync_op=sync_op) if out_axis != 0: ensure_divisibility(output.shape[0], get_world_size()) output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=out_axis) return output class All2All(PyLayer): @staticmethod def forward(ctx, input, in_axis=-1, out_axis=-1): ctx.in_axis = in_axis ctx.out_axis = out_axis return _all_to_all(input, in_axis=in_axis, out_axis=out_axis) @staticmethod def backward(ctx, grad_output): return _all_to_all( grad_output, in_axis=ctx.out_axis, out_axis=ctx.in_axis) def row_to_col(input): """ N, S, R, C => N, R, S, C using sync all_to_all """ if get_world_size() == 1: return input ensure_divisibility(input.shape[2], get_world_size()) input = paddle.concat( paddle.split( input, get_world_size(), axis=2), axis=0) if not input.stop_gradient: output = All2All.apply(input, in_axis=2, out_axis=1) else: output = _all_to_all(input, in_axis=2, out_axis=1) output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=1) return output def col_to_row(input): """ N, R, S, C => N, S, R, C using sync all_to_all """ if get_world_size() == 1: return input ensure_divisibility(input.shape[1], get_world_size()) input = paddle.concat( paddle.split( input, get_world_size(), axis=1), axis=0) if not input.stop_gradient: output = All2All.apply(input, in_axis=1, out_axis=2) else: output = _all_to_all(input, in_axis=1, out_axis=2) output = paddle.concat( paddle.split( output, get_world_size(), axis=0), axis=2) return output @paddle.no_grad() def grad_sync(param_groups): """ sync the gradients of params """ nranks = get_world_size() if nranks < 2: return comm_group = scg.dap_group for group in param_groups: if group.get("dap", False): for p in group['params']: if p.is_distributed: continue grad = p.grad if grad is None: continue paddle.distributed.all_reduce( grad, sync_op=True, group=comm_group) return None ================================================ FILE: ppfleetx/distributed/protein_folding/dp.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Distributed Data Parallel helper functions """ import paddle from . 
import scg __all__ = [ 'get_world_size', 'get_rank_in_group', 'grad_sync', 'param_sync' ] def get_world_size(): nranks = 1 if hasattr(scg, "dp_group"): nranks = scg.dp_group.nranks return nranks def get_rank_in_group(): rank = 0 if hasattr(scg, "get_rank_in_dp_group"): rank = scg.get_rank_in_dp_group() return rank @paddle.no_grad() def grad_sync(param_groups, grad_avg=True): """ sync the gradients of params """ nranks = get_world_size() if nranks < 2: return comm_group = scg.dp_group for group in param_groups: for p in group['params']: if p.is_distributed: continue grad = p.grad if grad is None: continue paddle.distributed.all_reduce( grad, sync_op=True, group=comm_group) if grad_avg: grad = p.grad.scale_(1.0 / nranks) return None @paddle.no_grad() def param_sync(model, src_rank=0, comm_group=None): """ broadcast params to other ranks """ nranks = paddle.distributed.get_world_size( ) if comm_group is None else comm_group.nranks if nranks < 2: return for _, param in model._obtain_parameters_buffers().items(): if param.is_distributed: continue if getattr(param, "no_sync", False): continue paddle.distributed.broadcast( param, src=src_rank, group=comm_group, sync_op=True) return None @paddle.no_grad() def all_reduce(tensor, op=paddle.distributed.ReduceOp.SUM): """ allreduce a tensor in bp group """ if get_world_size() == 1: return tensor group = scg.dp_group paddle.distributed.all_reduce( tensor, sync_op=True, op=op, group=group) return tensor ================================================ FILE: ppfleetx/distributed/protein_folding/scg.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Communication group manager """ import types import numpy as np from paddle import distributed as dist def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" assert numerator % denominator == 0, '{} is not divisible by {}'.format( numerator, denominator) class SingletonCommunicationGroup(object): """ A singleton communication group for hybrid parallel. """ def __init__(self): self.initialized = False def init_process_group(self, parallel_degree=[('dp', None)], custom_parallel_degree=None): """ init the hybrid parallel process group. In most cases, only one hybrid parallel process group is initialized in a distributed program, so this is a singleton design. args: parallel_degree(list of tuple): Each parallel strategy consists of a tuple. E.g. [('dp', None), ('pp', 2), ('mp', 2)], means that the data parallel degree is obtained by calculation, the pipeline parallel degree is 2, and the model parallel degree is 2. For data parallelism, it is special. It is assumed that data parallelism has always been in the outermost dimension. If it is not set, the data parallelism degree will be automatically calculated. When multiple distributed strategies fully overlap, this can be represented by setting multiple parallel names in a tuple. 
For example, [('dp', None), ('mp', 'bp', 2)]. Default is [('dp', None)] custom_parallel_degree(list of tuple): Higher-order usages can be used when the automatically derived parallel strategy fails to meet user needs. The user can calculate the rank id in the communication group and pass it in through the `custom_parallel_degree` arg. Default is None. E.g. [('dp', [[0, 2, 4, 6], [1, 3, 5, 7]]), ('mp', 'bp', [[0, 1], [2, 3], [4, 5], [6, 7]])] note: `parallel_degree` and `custom_parallel_degree` are mutually exclusive, only one can be set at the same time. example 1: # 8 gpus on single node, dp will be 2 # dp_group_ranks = [[0, 4], [1, 5], [2, 6], [3, 7]] # pp_group_ranks = [[0, 2], [1, 3], [4, 6], [5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('dp', None), ('pp', 2), ('mp', 2)]) print(scg.dp_group) print(scg.get_rank_in_bp_group()) print(scg.get_dp_world_size()) example 2: # 8 gpus on single node, dp will be 2 # dp_group_ranks = [[0, 4], [1, 5], [2, 6], [3, 7]] # pp_group_ranks = [[0, 2], [1, 3], [4, 6], [5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('pp', 2), ('mp', 2)]) example 3: # 8 gpus on single node, dp will be 4, mp and bp share a communication group. # dp_group_ranks = [[0, 2, 4, 6], [1, 3, 5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] # bp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('dp', None), ('mp', 'bp', 2)]) example 4: # 8 gpus on single node, dp will be 8, mp will be 8, dp and mp share a communication group. # dp_group_ranks = [[0, 1, 2, 3, 4, 5, 6, 7]] # mp_group_ranks = [[0, 1, 2, 3, 4, 5, 6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=[('dp', 'mp', 8)]) example 5: # Equal to example 3 but pass config by custom_parallel_degree. # dp_group_ranks = [[0, 2, 4, 6], [1, 3, 5, 7]] # mp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] # bp_group_ranks = [[0, 1], [2, 3], [4, 5], [6, 7]] scg = SingletonCommunicationGroup() scg.init_process_group(parallel_degree=None, custom_parallel_degree=[('dp', [[0, 2, 4, 6], [1, 3, 5, 7]]), ('mp', 'bp', [[0, 1], [2, 3], [4, 5], [6, 7]])]) """ assert not (parallel_degree is not None and custom_parallel_degree is not None), \ f"parallel_degree and custom_parallel_degree only can be set one." assert self.initialized == False, "Communication group is already initialized!" if dist.is_initialized() is not None: dist.init_parallel_env() world_size = dist.get_world_size() rank = dist.get_rank() # parse parallel_degree if parallel_degree is not None and custom_parallel_degree is None: def check_valid(inp): assert isinstance( inp, list), f"parallel_degree must be list of tuple" for item in inp: num_ele = len(item) assert num_ele >= 2, f"each item in parallel_degree must has least two element." assert isinstance(item[-1], ( int, type(None) )), f"the last element in each item must be int or None" for idx in range(num_ele - 1): assert isinstance(item[idx], str) check_valid(parallel_degree) dp_exist = False dp_has_set = False num_ranks = 1 for idx, item in enumerate(parallel_degree): degree = item[-1] if 'dp' in item: assert idx == 0, 'The data parallel dimension must be the outermost dimension.' dp_exist = True if degree is not None: dp_has_set = True else: degree = 1 assert degree is not None, 'All but dp must specify the parallel degree explicitly.' 
num_ranks *= degree # check and update dp if not dp_exist: assert world_size % num_ranks == 0, 'The total number of parallelism products set is not divisible by the total number of cards.' parallel_degree.insert(0, ('dp', world_size // num_ranks)) elif dp_exist and not dp_has_set: assert world_size % num_ranks == 0, 'The total number of parallelism products set is not divisible by the total number of cards.' parallel_degree[0] = ('dp', world_size // num_ranks) else: assert num_ranks == world_size, 'The total number of parallelism products set is not equal to the total number of cards.' degrees = tuple([item[-1] for item in parallel_degree]) num_parallel = len(parallel_degree) group_arr = np.arange(0, world_size).reshape(degrees) custom_parallel_degree = [] for idx, item in enumerate(parallel_degree): parallel_name = item[0] degree = item[-1] transpose_axes = [] for axis in range(num_parallel): if axis != idx: transpose_axes.append(axis) transpose_axes.append(idx) arr = group_arr.transpose(transpose_axes).reshape((-1, degree)) custom_parallel_degree.append([]) for parallel_name in item[:-1]: custom_parallel_degree[idx].append(parallel_name) custom_parallel_degree[idx].append([]) for i in range(world_size // degree): ranks = arr[i].tolist() custom_parallel_degree[idx][-1].append(ranks) custom_parallel_degree[idx] = tuple(custom_parallel_degree[ idx]) else: print( "We do not check the validity of user-defined custom_parallel_degree." ) # new group and set attr for item in custom_parallel_degree: ranks_list = item[-1] for i in range(len(ranks_list)): ranks = ranks_list[i] for parallel_name in item[:-1]: group = dist.new_group(ranks) print(f'> {parallel_name} ranks: {ranks}') if rank in ranks: setattr(self, f'{parallel_name}_group', group) def get_rank_in_group(parallel_name): def func(): if not self.initialized: return -1 group = getattr(self, f'{parallel_name}_group') return group.get_group_rank(dist.get_rank()) return func setattr(self, f'get_rank_in_{parallel_name}_group', get_rank_in_group(parallel_name)) def get_group_world_size(parallel_name): def func(): if not self.initialized: return -1 group = getattr(self, f'{parallel_name}_group') return group.nranks return func setattr(self, f'get_{parallel_name}_world_size', get_group_world_size(parallel_name)) self.initialized = True scg = SingletonCommunicationGroup() ================================================ FILE: ppfleetx/models/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import sys import copy from ppfleetx.core.module.basic_module import BasicModule from ppfleetx.models.language_model.language_module import GPTModule, GPTGenerationModule, GPTEvalModule, GPTFinetuneModule from ppfleetx.models.language_model.gpt.auto.auto_module import GPTModuleAuto, GPTGenerationModuleAuto from ppfleetx.models.vision_model.general_classification_module import GeneralClsModule, GeneralClsModuleAuto from ppfleetx.models.vision_model.moco_module import MOCOModule, MOCOClsModule from ppfleetx.models.multimodal_model.multimodal_module import ImagenModule from ppfleetx.models.language_model.ernie import ErnieModule, ErnieSeqClsModule, ErnieModuleAuto, ErnieSeqClsModuleAuto from ppfleetx.models.language_model.language_module import MoEModule from ppfleetx.models.multimodal_model.multimodal_module import ImagenModule def build_module(config): module_name = config.Model.get("module", "BasicModule") module = eval(module_name)(config) return module ================================================ FILE: ppfleetx/models/language_model/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/auto_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
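The `build_module` factory defined in `ppfleetx/models/__init__.py` above resolves the module class purely by name: it reads `config.Model.module` and evaluates that string against the classes imported in the same file. A minimal, self-contained sketch of that lookup follows; the `_AttrDict` stand-in and the `"GPTModule"` value are illustrative only and are not the real config loader.

```python
# Sketch of build_module()'s name lookup; the real config object comes from
# ppfleetx's YAML loader, so this tiny attribute-dict stand-in is hypothetical.
class _AttrDict(dict):
    __getattr__ = dict.__getitem__

config = _AttrDict(Model=_AttrDict(module="GPTModule"))
module_name = config.Model.get("module", "BasicModule")
print(module_name)  # "GPTModule" -- build_module() then calls eval(module_name)(config)
```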
import os import sys import numpy as np import paddle.distributed as dist import paddle.distributed.auto_parallel as auto from functools import reduce def process_mesh_config(config): class Mesh: def __init__(self, config): self.dp_dim = None self.mp_dim = None self.process_mesh = None self.config = config topology = list( filter(lambda x: x > 1, [ self.config['pp_degree'], self.config['dp_degree'], self.config['mp_degree'] ])) num_proc = 1 if not topology else reduce(lambda x, y: x * y, topology) processes = [i for i in range(num_proc)] if self.config['pp_degree'] > 1: if len(topology) > 1: # dpmppp, dppp, mppp if len(topology) > 2: # dpmppp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['pp', 'dp', 'mp']) self.dp_dim = 'dp' self.mp_dim = 'mp' elif self.config['dp_degree'] > 1: # dppp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['pp', 'dp']) self.dp_dim = 'dp' elif self.config['mp_degree'] > 1: # mppp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['pp', 'mp']) self.mp_dim = 'mp' elif len(topology) == 1: # pp self.process_mesh = auto.ProcessMesh( processes, dim_names=['pp']) else: if len(topology) > 1: # dpmp self.process_mesh = auto.ProcessMesh( np.array(processes).reshape(topology), dim_names=['dp', 'mp']) self.dp_dim = 'dp' self.mp_dim = 'mp' elif self.config['dp_degree'] > 1: # dp self.process_mesh = auto.ProcessMesh( processes, dim_names=['dp']) self.dp_dim = 'dp' elif self.config['mp_degree'] > 1: # mp self.process_mesh = auto.ProcessMesh( processes, dim_names=['mp']) self.mp_dim = 'mp' else: # serial self.process_mesh = auto.ProcessMesh(processes) def __getitem__(self, idx): if 'pp' in self.process_mesh.dim_names: return self.process_mesh[idx] return self.process_mesh def stages(self, num_layers): layer_per_stage = num_layers // self.config['pp_degree'] return [i // layer_per_stage for i in range(num_layers)] @property def dp(self): return self.dp_dim @property def mp(self): return self.mp_dim return Mesh(config) def process_model_configs(config): """ process model configs for auto parallel """ cfg_model = config['Model'] mesh = process_mesh_config(config['Distributed']) cfg_model.update({'mesh': mesh}) if cfg_model['ffn_hidden_size'] is None: cfg_model['ffn_hidden_size'] = 4 * cfg_model['hidden_size'] if cfg_model['use_recompute']: if not cfg_model.get('recompute_granularity', None): cfg_model['recompute_granularity'] = 'full' def process_data_configs(config): """ process data configs for auto parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] def process_configs(config): process_model_configs(config) process_data_configs(config) return config ================================================ FILE: ppfleetx/models/language_model/debertav2/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .modeling import (get_debertav2_model, debertav2_encode_text, get_debertav2_encoded_dim) from ppfleetx.models.language_model.t5 import normal_, constant_init ================================================ FILE: ppfleetx/models/language_model/debertav2/modeling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Paddle DeBERTa-v2 model.""" from collections.abc import Sequence from typing import Optional, Tuple, Union import json import paddle from paddle import nn from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss from ppfleetx.models.language_model.t5 import (finfo, ACT2FN, ModelOutput, normal_, constant_init) from ppfleetx.data.tokenizers.debertav2_tokenizer import debertav2_tokenize from dataclasses import dataclass class BaseModelOutput(ModelOutput): """ Base class for model's outputs, with potential hidden states and attentions. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ last_hidden_state = None hidden_states = None attentions = None # Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 class XSoftmax(paddle.autograd.PyLayer): """ Masked Softmax which is optimized for saving memory Args: input (`paddle.tensor`): The input tensor that will apply softmax. mask (`paddle.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation. 
dim (int): The dimension that will apply softmax Example: ```python >>> import paddle >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax >>> # Make a tensor >>> x = paddle.randn([4, 20, 100]) >>> # Create a mask >>> mask = (x > 0).int() >>> # Specify the dimension to apply softmax >>> dim = -1 >>> y = XSoftmax.apply(x, mask, dim) ```""" @staticmethod def forward(self, input, mask, dim): self.dim = dim #rmask = ~(mask.cast('bool')) #output = input.masked_fill(rmask, paddle.to_tensor(finfo(input.dtype).min)) mask = mask.cast('bool') output = paddle.where(mask == 0, paddle.to_tensor(finfo(input.dtype).min), input) output = paddle.nn.functional.softmax( output, axis=self.dim, dtype=paddle.float32) output = paddle.where(mask == 0, paddle.to_tensor(0.), output) return output # Copied from transformers.models.deberta.modeling_deberta.DropoutContext class DropoutContext(object): def __init__(self): self.dropout = 0 self.mask = None self.scale = 1 self.reuse_mask = True # Copied from transformers.models.deberta.modeling_deberta.get_mask def get_mask(input, local_context): if not isinstance(local_context, DropoutContext): dropout = local_context mask = None else: dropout = local_context.dropout dropout *= local_context.scale mask = local_context.mask if local_context.reuse_mask else None if dropout > 0 and mask is None: mask = (1 - paddle.bernoulli( paddle.full( shape=input.shape, fill_value=1 - dropout))).cast(bool) if isinstance(local_context, DropoutContext): if local_context.mask is None: local_context.mask = mask return mask, dropout # Copied from transformers.models.deberta.modeling_deberta.XDropout class XDropout(paddle.autograd.PyLayer): """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" @staticmethod def forward(ctx, input, local_ctx): mask, dropout = get_mask(input, local_ctx) ctx.scale = 1.0 / (1 - dropout) if dropout > 0: output = paddle.where(mask == 1, 0, input) return output * ctx.scale else: return input # Copied from transformers.models.deberta.modeling_deberta.StableDropout class StableDropout(nn.Layer): """ Optimized dropout module for stabilizing the training Args: drop_prob (float): the dropout probabilities """ def __init__(self, drop_prob): super().__init__() self.drop_prob = drop_prob self.count = 0 self.context_stack = None def forward(self, x): """ Call the module Args: x (`paddle.to_tensor`): The input tensor to apply dropout """ if self.training and self.drop_prob > 0: return XDropout.apply(x, self.get_context()) return x def clear_context(self): self.count = 0 self.context_stack = None def init_context(self, reuse_mask=True, scale=1): if self.context_stack is None: self.context_stack = [] self.count = 0 for c in self.context_stack: c.reuse_mask = reuse_mask c.scale = scale def get_context(self): if self.context_stack is not None: if self.count >= len(self.context_stack): self.context_stack.append(DropoutContext()) ctx = self.context_stack[self.count] ctx.dropout = self.drop_prob self.count += 1 return ctx else: return self.drop_prob # Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm class DebertaV2SelfOutput(nn.Layer): def __init__(self, hidden_size=1536, layer_norm_eps=1e-7, hidden_dropout_prob=0.1): super().__init__() self.dense = nn.Linear(hidden_size, hidden_size) self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) def forward(self, hidden_states, input_tensor): 
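        # Project the attention output, apply StableDropout, then residual-add the
        # block input and apply post-LayerNorm -- the same output pattern as BERT's
        # self-output block.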
hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states # Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 class DebertaV2Attention(nn.Layer): def __init__( self, hidden_size=512, num_attention_heads=24, attention_head_size=64, share_att_key=True, pos_att_type=None, relative_attention=True, position_buckets=-1, max_relative_positions=-1, max_position_embeddings=512, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, ): super().__init__() self.self = DisentangledSelfAttention( hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, share_att_key=share_att_key, pos_att_type=pos_att_type, relative_attention=relative_attention, position_buckets=position_buckets, max_relative_positions=max_relative_positions, max_position_embeddings=max_position_embeddings, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob, ) self.output = DebertaV2SelfOutput( hidden_size=hidden_size, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob) def forward( self, hidden_states, attention_mask, output_attentions=False, query_states=None, relative_pos=None, rel_embeddings=None, ): self_output = self.self( hidden_states, attention_mask, output_attentions, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, ) if output_attentions: self_output, att_matrix = self_output if query_states is None: query_states = hidden_states attention_output = self.output(self_output, query_states) if output_attentions: return (attention_output, att_matrix) else: return attention_output # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 class DebertaV2Intermediate(nn.Layer): def __init__( self, hidden_size=1536, hidden_act='gelu', intermediate_size=6144, ): super().__init__() self.dense = nn.Linear(hidden_size, intermediate_size) if isinstance(hidden_act, str): self.intermediate_act_fn = ACT2FN[hidden_act] else: self.intermediate_act_fn = hidden_act def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states # Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm class DebertaV2Output(nn.Layer): def __init__( self, hidden_size=512, intermediate_size=6144, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, ): super().__init__() self.dense = nn.Linear(intermediate_size, hidden_size) self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states # Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 class DebertaV2Layer(nn.Layer): def __init__( self, hidden_size=512, hidden_act='gelu', intermediate_size=6144, num_attention_heads=24, attention_head_size=64, share_att_key=True, pos_att_type=None, relative_attention=True, position_buckets=256, max_relative_positions=-1, max_position_embeddings=512, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, ): super().__init__() self.attention 
= DebertaV2Attention( hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, share_att_key=share_att_key, pos_att_type=pos_att_type, relative_attention=relative_attention, position_buckets=position_buckets, max_relative_positions=max_relative_positions, max_position_embeddings=max_position_embeddings, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob, ) self.intermediate = DebertaV2Intermediate( hidden_size=hidden_size, hidden_act=hidden_act, intermediate_size=intermediate_size, ) self.output = DebertaV2Output( hidden_size=hidden_size, intermediate_size=intermediate_size, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, ) def forward( self, hidden_states, attention_mask, query_states=None, relative_pos=None, rel_embeddings=None, output_attentions=False, ): attention_output = self.attention( hidden_states, attention_mask, output_attentions=output_attentions, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, ) if output_attentions: attention_output, att_matrix = attention_output intermediate_output = self.intermediate(attention_output) layer_output = self.output(intermediate_output, attention_output) if output_attentions: return (layer_output, att_matrix) else: return layer_output class ConvLayer(nn.Layer): def __init__( self, hidden_size=512, conv_kernel_size=3, conv_groups=1, conv_act="tanh", layer_norm_eps=1e-7, hidden_dropout_prob=0., ): super().__init__() kernel_size = conv_kernel_size groups = conv_groups self.conv_act = conv_act self.conv = nn.Conv1D( hidden_size, hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups) self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) def forward(self, hidden_states, residual_states, input_mask): out = self.conv(hidden_states.transpose([0, 2, 1])).transpose( [0, 2, 1]) out = paddle.where( input_mask.cast('bool').unsqueeze(-1).expand(out.shape) == 0, paddle.to_tensor(0.), out) out = ACT2FN[self.conv_act](self.dropout(out)) layer_norm_input = residual_states + out output = self.LayerNorm(layer_norm_input).cast(layer_norm_input.dtype) if input_mask is None: output_states = output else: if input_mask.dim() != layer_norm_input.dim(): if input_mask.dim() == 4: input_mask = input_mask.squeeze(1).squeeze(1) input_mask = input_mask.unsqueeze(2) input_mask = input_mask.cast(output.dtype) output_states = output * input_mask return output_states class DebertaV2Encoder(nn.Layer): """Modified BertEncoder with relative position bias support""" def __init__( self, num_hidden_layers=48, num_attention_heads=24, attention_head_size=64, relative_attention=False, max_relative_positions=-1, max_position_embeddings=512, position_buckets=256, hidden_size=1536, hidden_act='gelu', conv_act='gelu', intermediate_size=6144, share_att_key=True, pos_att_type=None, norm_rel_ebd=None, conv_kernel_size=0, layer_norm_eps=1e-7, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, ): super().__init__() self.layer = nn.LayerList([ DebertaV2Layer( hidden_size=hidden_size, hidden_act=hidden_act, intermediate_size=intermediate_size, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, share_att_key=share_att_key, pos_att_type=pos_att_type, relative_attention=relative_attention, position_buckets=position_buckets, max_relative_positions=max_relative_positions, 
max_position_embeddings=max_position_embeddings, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob) for _ in range(num_hidden_layers) ]) self.relative_attention = relative_attention if self.relative_attention: self.max_relative_positions = max_relative_positions if self.max_relative_positions < 1: self.max_relative_positions = max_position_embeddings self.position_buckets = position_buckets pos_ebd_size = self.max_relative_positions * 2 if self.position_buckets > 0: pos_ebd_size = self.position_buckets * 2 self.rel_embeddings = nn.Embedding(pos_ebd_size, hidden_size) self.norm_rel_ebd = [ x.strip() for x in norm_rel_ebd.lower().split("|") ] if "layer_norm" in self.norm_rel_ebd: self.LayerNorm = LayerNorm(hidden_size, layer_norm_eps) self.conv = ConvLayer( hidden_size=hidden_size, conv_kernel_size=conv_kernel_size, conv_act=conv_act, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, ) if conv_kernel_size > 0 else None self.gradient_checkpointing = False def get_rel_embedding(self): rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): rel_embeddings = self.LayerNorm(rel_embeddings) return rel_embeddings def get_attention_mask(self, attention_mask): if attention_mask.dim() <= 2: extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) attention_mask = extended_attention_mask * extended_attention_mask.squeeze( -2).unsqueeze(-1) attention_mask = attention_mask.cast(paddle.uint8) elif attention_mask.dim() == 3: attention_mask = attention_mask.unsqueeze(1) return attention_mask def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): if self.relative_attention and relative_pos is None: q = query_states.shape[ -2] if query_states is not None else hidden_states.shape[-2] relative_pos = build_relative_position( q, hidden_states.shape[-2], bucket_size=self.position_buckets, max_position=self.max_relative_positions) return relative_pos def forward( self, hidden_states, attention_mask, output_hidden_states=True, output_attentions=False, query_states=None, relative_pos=None, return_dict=True, ): if attention_mask.dim() <= 2: input_mask = attention_mask else: input_mask = (attention_mask.sum(-2) > 0).cast(paddle.uint8) attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None if isinstance(hidden_states, Sequence): next_kv = hidden_states[0] else: next_kv = hidden_states rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (output_states, ) if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs, output_attentions) return custom_forward output_states = paddle.utils.checkpoint.checkpoint( create_custom_forward(layer_module), next_kv, attention_mask, query_states, relative_pos, rel_embeddings, ) else: output_states = layer_module( next_kv, attention_mask, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings, output_attentions=output_attentions, ) if output_attentions: output_states, att_m = output_states if i == 0 and self.conv is not None: output_states = 
self.conv(hidden_states, output_states, input_mask) if query_states is not None: query_states = output_states if isinstance(hidden_states, Sequence): next_kv = hidden_states[i + 1] if i + 1 < len( self.layer) else None else: next_kv = output_states if output_attentions: all_attentions = all_attentions + (att_m, ) if output_hidden_states: all_hidden_states = all_hidden_states + (output_states, ) if not return_dict: return tuple( v for v in [output_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions) def make_log_bucket_position(relative_pos, bucket_size, max_position): sign = paddle.sign(relative_pos.cast('float32')) mid = bucket_size // 2 abs_pos = paddle.where( (relative_pos < mid) & (relative_pos > -mid), paddle.to_tensor(mid - 1).astype(relative_pos.dtype), paddle.abs(relative_pos), ) log_pos = (paddle.ceil( paddle.log(abs_pos / mid) / paddle.log(paddle.to_tensor((max_position - 1) / mid)) * (mid - 1)) + mid) bucket_pos = paddle.where(abs_pos <= mid, relative_pos.cast(log_pos.dtype), log_pos * sign) return bucket_pos def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1): """ Build relative position according to the query and key We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - P_k\\) Args: query_size (int): the length of query key_size (int): the length of key bucket_size (int): the size of position bucket max_position (int): the maximum allowed absolute position Return: `paddle.LongTensor`: A tensor with shape [1, query_size, key_size] """ q_ids = paddle.arange(0, query_size) k_ids = paddle.arange(0, key_size) rel_pos_ids = q_ids[:, None] - k_ids[None, :] if bucket_size > 0 and max_position > 0: rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) rel_pos_ids = rel_pos_ids.cast(paddle.int64) rel_pos_ids = rel_pos_ids[:query_size, :] rel_pos_ids = rel_pos_ids.unsqueeze(0) return rel_pos_ids # Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): return c2p_pos.expand([ query_layer.shape[1], query_layer.shape[1], query_layer.shape[2], relative_pos.shape[-1] ]) # Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): return c2p_pos.expand([ query_layer.shape[0], query_layer.shape[1], key_layer.shape[-2], key_layer.shape[-2] ]) # Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand def pos_dynamic_expand(pos_index, p2c_att, key_layer): return pos_index.expand([ tuplt(p2c_att.shape[:2]) + (pos_index.shape[-2], key_layer.shape[-2]) ]) class DisentangledSelfAttention(nn.Layer): """ Disentangled self-attention module Parameters: """ def __init__( self, hidden_size=1536, num_attention_heads=24, attention_head_size=None, share_att_key=False, pos_att_type=None, relative_attention=False, position_buckets=-1, max_relative_positions=-1, max_position_embeddings=512, hidden_dropout_prob=0., attention_probs_dropout_prob=0., ): super().__init__() if hidden_size % num_attention_heads != 0: raise ValueError( f"The hidden size ({hidden_size}) is not a multiple of the number of attention " f"heads ({num_attention_heads})") self.num_attention_heads = num_attention_heads 
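        # The per-head size defaults to hidden_size // num_attention_heads unless an
        # explicit attention_head_size is given; the Q/K/V projections each map to
        # num_attention_heads * attention_head_size features.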
_attention_head_size = hidden_size // num_attention_heads self.attention_head_size = attention_head_size if attention_head_size is not None else _attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size self.query_proj = nn.Linear(hidden_size, self.all_head_size) self.key_proj = nn.Linear(hidden_size, self.all_head_size) self.value_proj = nn.Linear(hidden_size, self.all_head_size) self.share_att_key = share_att_key self.pos_att_type = pos_att_type if pos_att_type is not None else [] self.relative_attention = relative_attention if self.relative_attention: self.position_buckets = position_buckets self.max_relative_positions = max_relative_positions if self.max_relative_positions < 1: self.max_relative_positions = max_position_embeddings self.pos_ebd_size = self.max_relative_positions if self.position_buckets > 0: self.pos_ebd_size = self.position_buckets self.pos_dropout = StableDropout(hidden_dropout_prob) if not self.share_att_key: if "c2p" in self.pos_att_type: self.pos_key_proj = nn.Linear( hidden_size, self.all_head_size, bias=True) if "p2c" in self.pos_att_type: self.pos_query_proj = nn.Linear(hidden_size, self.all_head_size) self.dropout = StableDropout(attention_probs_dropout_prob) def transpose_for_scores(self, x, attention_heads): new_x_shape = tuple(x.shape[:-1]) + (attention_heads, -1) x = x.reshape(new_x_shape) return x.transpose([0, 2, 1, 3]).reshape([-1, x.shape[1], x.shape[-1]]) def forward( self, hidden_states, attention_mask, output_attentions=False, query_states=None, relative_pos=None, rel_embeddings=None, ): """ Call the module Args: hidden_states (`paddle.FloatTensor`): Input states to the module usually the output from previous layer, it will be the Q,K and V in *Attention(Q,K,V)* attention_mask (`paddle.uint8`): An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* th token. output_attentions (`bool`, optional): Whether return the attention matrix. query_states (`paddle.FloatTensor`, optional): The *Q* state in *Attention(Q,K,V)*. relative_pos (`paddle.LongTensor`): The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with values ranging in [*-max_relative_positions*, *max_relative_positions*]. rel_embeddings (`paddle.FloatTensor`): The embedding of relative distances. It's a tensor of shape [\\(2 \\times \\text{max_relative_positions}\\), *hidden_size*]. """ if query_states is None: query_states = hidden_states query_layer = self.transpose_for_scores( self.query_proj(query_states), self.num_attention_heads) key_layer = self.transpose_for_scores( self.key_proj(hidden_states), self.num_attention_heads) value_layer = self.transpose_for_scores( self.value_proj(hidden_states), self.num_attention_heads) rel_att = None # Take the dot product between "query" and "key" to get the raw attention scores. 
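        # Scores are scaled by sqrt(head_dim * scale_factor), where scale_factor counts
        # the enabled score terms: content-to-content plus the optional "c2p" and "p2c"
        # disentangled relative-position terms (at most 3 in total), as in DeBERTa's
        # scaled disentangled attention.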
scale_factor = 1 if "c2p" in self.pos_att_type: scale_factor += 1 if "p2c" in self.pos_att_type: scale_factor += 1 scale = paddle.sqrt( paddle.to_tensor( query_layer.shape[-1], dtype='float32') * scale_factor) attention_scores = paddle.bmm( query_layer, key_layer.transpose( [0, 2, 1])) / scale.cast(query_layer.dtype) if self.relative_attention: rel_embeddings = self.pos_dropout(rel_embeddings) rel_att = self.disentangled_attention_bias( query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) if rel_att is not None: attention_scores = attention_scores + rel_att attention_scores = attention_scores attention_scores = attention_scores.reshape([ -1, self.num_attention_heads, attention_scores.shape[-2], attention_scores.shape[-1] ]) # bsz x height x length x dimension attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) attention_probs = self.dropout(attention_probs) context_layer = paddle.bmm( attention_probs.reshape( [-1, attention_probs.shape[-2], attention_probs.shape[-1]]), value_layer) context_layer = (context_layer.reshape([ -1, self.num_attention_heads, context_layer.shape[-2], context_layer.shape[-1] ]).transpose([0, 2, 1, 3])) new_context_layer_shape = tuple(context_layer.shape[:-2]) + (-1, ) context_layer = context_layer.reshape(new_context_layer_shape) if output_attentions: return (context_layer, attention_probs) else: return context_layer def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): if relative_pos is None: q = query_layer.shape[-2] relative_pos = build_relative_position( q, key_layer.shape[-2], bucket_size=self.position_buckets, max_position=self.max_relative_positions) if relative_pos.dim() == 2: relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) elif relative_pos.dim() == 3: relative_pos = relative_pos.unsqueeze(1) # bsz x height x query x key elif relative_pos.dim() != 4: raise ValueError( f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}" ) att_span = self.pos_ebd_size relative_pos = relative_pos.cast(paddle.int64) rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0) if self.share_att_key: pos_query_layer = paddle.tile( self.transpose_for_scores( self.query_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) pos_key_layer = paddle.tile( self.transpose_for_scores( self.key_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) else: if "c2p" in self.pos_att_type: pos_key_layer = paddle.tile( self.transpose_for_scores( self.pos_key_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) # .split(self.all_head_size, dim=-1) if "p2c" in self.pos_att_type: pos_query_layer = paddle.tile( self.transpose_for_scores( self.pos_query_proj(rel_embeddings), self.num_attention_heads), repeat_times=[ query_layer.shape[0] // self.num_attention_heads, 1, 1 ]) # .split(self.all_head_size, dim=-1) score = 0 # content->position if "c2p" in self.pos_att_type: scale = paddle.sqrt( paddle.to_tensor( pos_key_layer.shape[-1], dtype='float32') * scale_factor) c2p_att = paddle.bmm(query_layer, pos_key_layer.transpose([0, 2, 1])) c2p_pos = paddle.clip(relative_pos + att_span, 0, att_span * 2 - 1) c2p_att = paddle.take_along_axis( c2p_att, axis=-1, indices=c2p_pos.squeeze(0).expand([ query_layer.shape[0], query_layer.shape[1], relative_pos.shape[-1] ]), ) score += c2p_att / scale.cast(dtype=c2p_att.dtype) # position->content if "p2c" in self.pos_att_type: scale = paddle.sqrt( paddle.to_tensor( pos_query_layer.shape[-1], dtype='float32') * scale_factor) if key_layer.shape[-2] != query_layer.shape[-2]: r_pos = build_relative_position( key_layer.shape[-2], key_layer.shape[-2], bucket_size=self.position_buckets, max_position=self.max_relative_positions, ) r_pos = r_pos.unsqueeze(0) else: r_pos = relative_pos p2c_pos = paddle.clip(-r_pos + att_span, 0, att_span * 2 - 1) p2c_att = paddle.bmm(key_layer, pos_query_layer.transpose([0, 2, 1])) p2c_att = paddle.take_along_axis( p2c_att, axis=-1, indices=p2c_pos.squeeze(0).expand([ query_layer.shape[0], key_layer.shape[-2], key_layer.shape[-2] ]), ).transpose([0, 2, 1]) score += p2c_att / scale.cast(dtype=p2c_att.dtype) return score # Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm class DebertaV2Embeddings(nn.Layer): """Construct the embeddings from word, position and token_type embeddings.""" def __init__( self, max_position_embeddings=512, position_biased_input=False, pad_token_id=0, hidden_size=1536, hidden_dropout_prob=0.1, embedding_size=None, vocab_size=128100, type_vocab_size=0, layer_norm_eps=1e-7, ): super().__init__() self.embedding_size = hidden_size if embedding_size is None else embedding_size self.word_embeddings = nn.Embedding( vocab_size, self.embedding_size, padding_idx=pad_token_id) self.type_vocab_size = type_vocab_size self.hidden_size = hidden_size self.position_biased_input = position_biased_input if not self.position_biased_input: self.position_embeddings = None else: self.position_embeddings = nn.Embedding(max_position_embeddings, self.embedding_size) if type_vocab_size > 0: self.token_type_embeddings = nn.Embedding(type_vocab_size, self.embedding_size) if self.embedding_size != hidden_size: self.embed_proj = nn.Linear(self.embedding_size, hidden_size) self.LayerNorm = LayerNorm(hidden_size, 
layer_norm_eps) self.dropout = StableDropout(hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer("position_ids", paddle.arange(max_position_embeddings).expand( (1, -1))) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): if input_ids is not None: input_shape = input_ids.shape else: input_shape = inputs_embeds.shape[:-1] seq_length = input_shape[1] if position_ids is None: position_ids = self.position_ids[:, :seq_length] if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) if self.position_embeddings is not None: position_embeddings = self.position_embeddings( position_ids.cast(paddle.int64)) else: position_embeddings = paddle.zeros_like(inputs_embeds) embeddings = inputs_embeds if self.position_biased_input: embeddings += position_embeddings if self.type_vocab_size > 0: token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings += token_type_embeddings if self.embedding_size != self.hidden_size: embeddings = self.embed_proj(embeddings) embeddings = self.LayerNorm(embeddings) if mask is not None: if mask.dim() != embeddings.dim(): if mask.dim() == 4: mask = mask.squeeze(1).squeeze(1) mask = mask.unsqueeze(2) mask = mask.cast('float32') embeddings = embeddings * mask embeddings = self.dropout(embeddings) return embeddings # Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 class DebertaV2PreTrainedModel(nn.Layer): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ base_model_prefix = "deberta" _keys_to_ignore_on_load_missing = ["position_ids"] _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 normal_(module.weight, mean=0.0, std=0.02) if module.bias is not None: constant_init(module.bias, 0.) elif isinstance(module, nn.Embedding): normal_(module.weight, mean=0.0, std=0.02) if module.padding_idx is not None: constant_init(module.weight.data[module.padding_idx], 0.) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, DebertaV2Encoder): module.gradient_checkpointing = value DEBERTA_START_DOCSTRING = r""" The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. This model is also a PyTorch [paddle.nn.Layer](https://pytorch.org/docs/stable/nn.html#paddle.nn.Layer) subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. 
Parameters: """ # Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 class DebertaV2Model(DebertaV2PreTrainedModel): def __init__(self, _name_or_path="cache/deberta-v-xxlarge", attention_head_size=64, attention_probs_dropout_prob=0.1, conv_act="gelu", conv_kernel_size=3, hidden_act="gelu", hidden_dropout_prob=0.1, hidden_size=1536, initializer_range=0.02, intermediate_size=6144, layer_norm_eps=1e-07, max_position_embeddings=512, max_relative_positions=-1, model_type="deberta-v2", norm_rel_ebd="layer_norm", num_attention_heads=24, num_hidden_layers=48, pad_token_id=0, pooler_dropout=0, pooler_hidden_act="gelu", pooler_hidden_size=1536, pos_att_type=["p2c", "c2p"], position_biased_input=False, position_buckets=256, relative_attention=True, share_att_key=True, type_vocab_size=0, vocab_size=128100, output_attentions=False, output_hidden_states=False, use_return_dict=True): super().__init__() self.embeddings = DebertaV2Embeddings( max_position_embeddings=max_position_embeddings, position_biased_input=position_biased_input, pad_token_id=pad_token_id, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, vocab_size=vocab_size, type_vocab_size=type_vocab_size, layer_norm_eps=layer_norm_eps) self.encoder = DebertaV2Encoder( num_hidden_layers=num_hidden_layers, num_attention_heads=num_attention_heads, attention_head_size=attention_head_size, relative_attention=relative_attention, max_relative_positions=max_relative_positions, max_position_embeddings=max_position_embeddings, position_buckets=position_buckets, hidden_size=hidden_size, norm_rel_ebd=norm_rel_ebd, conv_kernel_size=conv_kernel_size, hidden_act=hidden_act, conv_act=conv_act, intermediate_size=intermediate_size, share_att_key=share_att_key, pos_att_type=pos_att_type, layer_norm_eps=layer_norm_eps, hidden_dropout_prob=hidden_dropout_prob, attention_probs_dropout_prob=attention_probs_dropout_prob, ) self.z_steps = 0 self.output_attentions = output_attentions self.output_hidden_states = output_hidden_states self.use_return_dict = use_return_dict def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ raise NotImplementedError( "The prune function is not implemented in DeBERTa model.") def forward( self, input_ids: Optional[paddle.Tensor]=None, attention_mask: Optional[paddle.Tensor]=None, token_type_ids: Optional[paddle.Tensor]=None, position_ids: Optional[paddle.Tensor]=None, inputs_embeds: Optional[paddle.Tensor]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.output_hidden_states) return_dict = return_dict if return_dict is not None else self.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time" ) elif input_ids is not None: input_shape = input_ids.shape elif inputs_embeds is not None: input_shape = inputs_embeds.shape[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") if attention_mask is None: attention_mask = paddle.ones(input_shape) if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) embedding_output = self.embeddings( input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids, mask=attention_mask, inputs_embeds=inputs_embeds, ) encoder_outputs = self.encoder( embedding_output, attention_mask, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict, ) encoded_layers = encoder_outputs[1] if self.z_steps > 1: hidden_states = encoded_layers[-2] layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] query_states = encoded_layers[-1] rel_embeddings = self.encoder.get_rel_embedding() attention_mask = self.encoder.get_attention_mask(attention_mask) rel_pos = self.encoder.get_rel_pos(embedding_output) for layer in layers[1:]: query_states = layer( hidden_states, attention_mask, output_attentions=False, query_states=query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings, ) encoded_layers.append(query_states) sequence_output = encoded_layers[-1] if not return_dict: return (sequence_output, ) + encoder_outputs[(1 if output_hidden_states else 2):] return BaseModelOutput( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, attentions=encoder_outputs.attentions, ) def get_debertav2_model(name, pretrained=True): if name is None: return None model = DebertaV2Model( _name_or_path=name, attention_head_size=64, attention_probs_dropout_prob=0.1, conv_act="gelu", conv_kernel_size=3, hidden_act="gelu", hidden_dropout_prob=0.1, hidden_size=1536, initializer_range=0.02, intermediate_size=6144, layer_norm_eps=1e-07, max_position_embeddings=512, max_relative_positions=-1, model_type="deberta-v2", norm_rel_ebd="layer_norm", num_attention_heads=24, num_hidden_layers=48, pad_token_id=0, pooler_dropout=0, pooler_hidden_act="gelu", pooler_hidden_size=1536, pos_att_type=["p2c", "c2p"], position_biased_input=False, position_buckets=256, relative_attention=True, share_att_key=True, type_vocab_size=0, vocab_size=128100, output_attentions=False, output_hidden_states=False, use_return_dict=True, ) if pretrained: checkpoint = paddle.load(name + '/debertav2.pd', return_numpy=True) 
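        # Restore the converted Paddle weights, then put the encoder in eval mode and
        # stop gradients on every parameter so it can be used as a frozen text encoder.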
model.set_state_dict(checkpoint['model']) model.eval() for p in model.parameters(): p.stop_gradient = True return model def dict_from_json_file(name): with open(name + '/config.json', "r", encoding="utf-8") as reader: text = reader.read() config_dict = json.loads(text) return config_dict def debertav2_encode_text(debertav2, texts, tokenizer, return_attn_mask=False): token_ids, attn_mask = debertav2_tokenize(texts, tokenizer) debertav2.eval() with paddle.no_grad(): output = debertav2(input_ids=token_ids, attention_mask=attn_mask) encoded_text = output.last_hidden_state.detach() attn_mask = attn_mask.cast(bool) encoded_text = paddle.where(attn_mask[:, :, None] == 0, paddle.to_tensor(0.), encoded_text) if return_attn_mask: return encoded_text, attn_mask return encoded_text def get_debertav2_encoded_dim(name): return dict_from_json_file(name)['hidden_size'] if __name__ == '__main__': model = get_debertav2_model( name='/dbq/codes/CL/paddle-imagen/cache/deberta-v-xxlarge', pretrained=False) ================================================ FILE: ppfleetx/models/language_model/ernie/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .ernie_module import ErnieModule, ErnieSeqClsModule from .auto.auto_module import ErnieModuleAuto, ErnieSeqClsModuleAuto ================================================ FILE: ppfleetx/models/language_model/ernie/auto/__init__.py ================================================ ================================================ FILE: ppfleetx/models/language_model/ernie/auto/auto_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import os import io import copy import logging import json import paddle import paddle.nn as nn import paddle.distributed.auto_parallel as auto from paddle.nn import functional as F from paddle.nn.initializer.lazy_init import _lazy_init_helper from dataclasses import dataclass, field from ..layers.model_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ErnieForPreTrainingOutput, SequenceClassifierOutput, ) from .auto_transformer import TransformerEncoderLayer, TransformerEncoder class Embedding(nn.Layer): def __init__( self, num_embeddings, embedding_dim, padding_idx=None, sparse=False, weight_attr=None, name=None, ): super().__init__() self._num_embeddings = num_embeddings self._embedding_dim = embedding_dim self._sparse = sparse self._is_distributed = False self._padding_idx = padding_idx if self._num_embeddings <= 0: raise ValueError("num_embeddings must be gather than 0") if self._embedding_dim <= 0: raise ValueError("embedding_dim must be gather than 0") padding_idx = (-1 if padding_idx is None else padding_idx if padding_idx >= 0 else (num_embeddings + padding_idx)) if padding_idx >= num_embeddings or padding_idx < -num_embeddings: raise ValueError("padding_idx must be within [-{}, {})".format( num_embeddings, num_embeddings)) self._dtype = self._helper.get_default_dtype() self._size = [self._num_embeddings, self._embedding_dim] self._weight_attr = weight_attr self._remote_prefetch = False self._name = name self.weight = self.create_parameter( attr=self._weight_attr, shape=self._size, dtype=self._dtype, is_bias=False, ) if paddle.in_dynamic_mode( ) and padding_idx != -1 and not _lazy_init_helper.state: with paddle.no_grad(): self.weight[padding_idx] = 0.0 def forward(self, x): return F.embedding( x, weight=self.weight, padding_idx=self._padding_idx, sparse=self._sparse, name=self._name, ) def extra_repr(self): main_str = '{_num_embeddings}, {_embedding_dim}' if self._padding_idx is not None: main_str += ', padding_idx={_padding_idx}' main_str += ', sparse={_sparse}' if self._name is not None: main_str += ', name={_name}' return main_str.format(**self.__dict__) class ErnieEmbeddings(nn.Layer): r""" Include embeddings from word, position and token_type embeddings. 
""" def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, weight_attr=None, task_type_vocab_size=3, task_id=0, use_task_id=False, mesh=None): super(ErnieEmbeddings, self).__init__() self.mesh = mesh self.word_embeddings = Embedding( vocab_size, hidden_size, padding_idx=pad_token_id, weight_attr=weight_attr) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=weight_attr) self.type_vocab_size = type_vocab_size if self.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding( type_vocab_size, hidden_size, weight_attr=weight_attr) self.use_task_id = use_task_id self.task_id = task_id if self.use_task_id: self.task_type_embeddings = nn.Embedding( task_type_vocab_size, hidden_size, weight_attr=weight_attr) self.layer_norm = nn.LayerNorm(hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None, task_type_ids=None, inputs_embeds=None, past_key_values_length=None): if input_ids is not None: auto.shard_tensor(self.word_embeddings.weight, self.mesh[0], [self.mesh.mp, None]) input_shape = paddle.shape(input_ids) input_embeddings = self.word_embeddings(input_ids) else: input_shape = paddle.shape(inputs_embeds)[:-1] input_embeddings = inputs_embeds if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph #seq_length = input_ids.shape[1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) position_ids = seq_length - ones if past_key_values_length is not None: position_ids += past_key_values_length position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) embeddings = input_embeddings + position_embeddings if self.type_vocab_size > 0: if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = embeddings + token_type_embeddings if self.use_task_id: if task_type_ids is None: task_type_ids = paddle.ones( input_shape, dtype="int64") * self.task_id task_type_embeddings = self.task_type_embeddings(task_type_ids) embeddings = embeddings + task_type_embeddings embeddings = self.layer_norm(embeddings) embeddings = self.dropout(embeddings) return embeddings class ErniePooler(nn.Layer): def __init__(self, hidden_size, weight_attr=None): super(ErniePooler, self).__init__() self.dense = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class ErnieModelAuto(nn.Layer): r""" The bare ERNIE Model transformer outputting raw hidden-states. This model is a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: vocab_size (int): Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. hidden_size (int, optional): Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. 
num_hidden_layers (int, optional): Number of hidden layers in the Transformer encoder. Defaults to `12`. num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. Defaults to `12`. intermediate_size (int, optional): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. Defaults to `3072`. hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. Defaults to `"gelu"`. hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. Defaults to `0.1`. attention_probs_dropout_prob (float, optional): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. Defaults to `0.1`. max_position_embeddings (int, optional): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. Defaults to `512`. type_vocab_size (int, optional): The vocabulary size of the `token_type_ids`. Defaults to `2`. initializer_range (float, optional): The standard deviation of the normal initializer for initializing all weight matrices. Defaults to `0.02`. .. note:: A normal_initializer initializes weight matrices as normal distributions. See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. pad_token_id(int, optional): The index of padding token in the token vocabulary. Defaults to `0`. """ def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False, mesh=None): super(ErnieModelAuto, self).__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.initializer_range)) self.embeddings = ErnieEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, pad_token_id, weight_attr, task_type_vocab_size, task_id, use_task_id, mesh) encoder_layer = TransformerEncoderLayer( hidden_size, num_attention_heads, intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=0, weight_attr=weight_attr, normalize_before=False, mesh=mesh, mesh_idx=0) self.encoder = TransformerEncoder( encoder_layer, num_hidden_layers, enable_recompute=use_recompute, mesh=mesh) self.pooler = ErniePooler(hidden_size, weight_attr) self.apply(self.init_weights) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, task_type_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_hidden_states=False, 
output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. token_type_ids (Tensor, optional): Segment token indices to indicate different portions of the inputs. Selected in the range ``[0, type_vocab_size - 1]``. If `type_vocab_size` is 2, which means the inputs have two portions. Indices can either be 0 or 1: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. Defaults to `None`, which means we don't add segment embeddings. position_ids (Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, max_position_embeddings - 1]``. Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. attention_mask (Tensor, optional): Mask used in multi-head attention to avoid performing attention on to some unwanted positions, usually the paddings or the subsequent positions. Its data type can be int, float and bool. When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. When the data type is int, the `masked` tokens have `0` values and the others have `1` values. When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], [batch_size, num_attention_heads, sequence_length, sequence_length]. We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, "使" and "用" will have the same value. Defaults to `None`, which means nothing needed to be prevented attention to. inputs_embeds (Tensor, optional): If you want to control how to convert `inputs_ids` indices into associated vectors, you can pass an embedded representation directly instead of passing `inputs_ids`. past_key_values (tuple(tuple(Tensor)), optional): The length of tuple equals to the number of layers, and each inner tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) which contains precomputed key and value hidden states of the attention blocks. If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are returned. Defaults to `None`. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. """ if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time." ) elif input_ids is not None: input_shape = paddle.shape(input_ids) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") past_key_values_length = None if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length) self.encoder._use_cache = use_cache # To be consistent with HF encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, cache=past_key_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = encoder_outputs pooled_output = self.pooler(sequence_output) return (sequence_output, pooled_output) else: sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErnieLMPredictionHead(nn.Layer): r""" Ernie Model with a `language modeling` head on top. 
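    This head projects encoder hidden states back onto the vocabulary to produce
    masked-token prediction scores.

    Examples:

        A minimal usage sketch (the sizes below are illustrative only):

        .. code-block:: python

            import paddle

            # 768-d hidden states, 30522-word vocabulary, GELU activation
            head = ErnieLMPredictionHead(
                hidden_size=768, vocab_size=30522, activation="gelu")
            hidden_states = paddle.rand([2, 16, 768])  # [batch_size, seq_len, hidden_size]
            prediction_scores = head(hidden_states)    # [2, 16, 30522]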
""" def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErnieLMPredictionHead, self).__init__() self.transform = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = getattr(nn.functional, activation) self.layer_norm = nn.LayerNorm(hidden_size) self.decoder_weight = self.create_parameter( shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, attr=weight_attr, is_bias=False) # if embedding_weights is None else embedding_weights self.decoder_bias = self.create_parameter( shape=[self.decoder_weight.shape[0]], dtype=self.decoder_weight.dtype, is_bias=True) def forward(self, hidden_states, masked_positions=None): if masked_positions is not None: hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) hidden_states = paddle.tensor.gather(hidden_states, masked_positions) # gather masked tokens might be more quick hidden_states = self.transform(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.layer_norm(hidden_states) # hidden_states = parallel_matmul(hidden_states, self.decoder_weight, True) + self.decoder_bias hidden_states = paddle.matmul( hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias return hidden_states class ErniePretrainingHeads(nn.Layer): def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErniePretrainingHeads, self).__init__() self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights, weight_attr) self.seq_relationship = nn.Linear( hidden_size, 2, weight_attr=weight_attr) def forward(self, sequence_output, pooled_output, masked_positions=None): prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class ErnieForPretrainingAuto(nn.Layer): r""" Ernie Model with a `masked language modeling` head and a `sentence order prediction` head on top. """ def __init__(self, ernie): super(ErnieForPretrainingAuto, self).__init__() self.ernie = ernie weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.ernie.initializer_range)) self.cls = ErniePretrainingHeads( self.ernie.hidden_size, self.ernie.vocab_size, self.ernie.hidden_act, embedding_weights=self.ernie.embeddings.word_embeddings.weight, weight_attr=weight_attr, ) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_positions=None, position_ids=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids (Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. inputs_embeds(Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size, sequence_length)`, optional): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. next_sentence_label (Tensor of shape `(batch_size,)`, optional): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`. """ # with paddle.static.amp.fp16_guard(): outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output, masked_positions) total_loss = None if labels is not None and next_sentence_label is not None: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.reshape( (-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1, ))) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1, ))) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ( (total_loss, ) + output) if total_loss is not None else output return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErniePretrainingCriterionAuto(paddle.nn.Layer): r""" The loss output of Ernie Model during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """ def __init__(self, with_nsp_loss=True): super(ErniePretrainingCriterionAuto, self).__init__() self.with_nsp_loss = with_nsp_loss def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): """ Args: prediction_scores(Tensor): The scores of masked token prediction. Its data type should be float32. If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. Otherwise, its shape is [batch_size, mask_token_num, vocab_size] seq_relationship_score(Tensor): The scores of next sentence prediction. 
Its data type should be float32 and its shape is [batch_size, 2] masked_lm_labels(Tensor): The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. Otherwise, its shape is [batch_size, mask_token_num, 1] next_sentence_labels(Tensor): The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1] Returns: Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. Its data type should be float32 and its shape is [1]. """ # with paddle.static.amp.fp16_guard(): masked_lm_loss = F.cross_entropy( prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none') if not self.with_nsp_loss: return paddle.mean(masked_lm_loss) next_sentence_loss = F.cross_entropy( seq_relationship_score, next_sentence_labels, reduction='none') loss = paddle.mean(masked_lm_loss) + paddle.mean(next_sentence_loss) return loss class ErnieForSequenceClassificationAuto(nn.Layer): """ Ernie Model with a linear layer on top of the output layer, designed for sequence classification/regression tasks like GLUE tasks. Args: ernie (:class:`ErnieModel`): An instance of ErnieModel. num_classes (int, optional): The number of classes. Defaults to `2`. dropout (float, optional): The dropout probability for output of ERNIE. If None, use the same value as `hidden_dropout_prob` of `ErnieModel` instance `ernie`. Defaults to None. """ def __init__(self, ernie, num_classes=2, dropout=None): super(ErnieForSequenceClassificationAuto, self).__init__() self.num_classes = num_classes self.ernie = ernie # allow ernie to be config self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.hidden_dropout_prob) self.classifier = nn.Linear(self.ernie.hidden_size, num_classes) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" The ErnieForSequenceClassification forward method, overrides the __call__() special method. Args: input_ids (Tensor): See :class:`ErnieModelAuto`. token_type_ids (Tensor, optional): See :class:`ErnieModelAuto`. position_ids(Tensor, optional): See :class:`ErnieModelAuto`. attention_mask (Tensor, optional): See :class:`ErnieModelAuto`. labels (Tensor of shape `(batch_size,)`, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` a regression loss is computed (Mean-Square loss), If `num_classes > 1` a classification loss is computed (Cross-Entropy). output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`. """ outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_classes == 1: loss_fct = paddle.nn.MSELoss() loss = loss_fct(logits, labels) elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: loss_fct = paddle.nn.CrossEntropyLoss() loss = loss_fct( logits.reshape((-1, self.num_classes)), labels.reshape((-1, ))) else: loss_fct = paddle.nn.BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else ( output[0] if len(output) == 1 else output) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 ================================================ FILE: ppfleetx/models/language_model/ernie/auto/auto_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
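# Note on the auto-parallel annotations used by `auto_model.py` above: each shardable
# weight is annotated with `auto.shard_tensor` against a named process mesh produced
# by `process_mesh_config`. A minimal, illustrative sketch of that pattern follows
# (the 2x2 mesh, the axis names "dp"/"mp" and the helper name
# `_shard_embedding_sketch` are assumptions for illustration, not part of this module):
#
#     import paddle.distributed.auto_parallel as auto
#
#     def _shard_embedding_sketch(weight):
#         # 2 data-parallel groups x 2 model-parallel groups over ranks 0-3
#         mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["dp", "mp"])
#         # split the vocabulary (first) dimension of the embedding weight over "mp";
#         # the hidden dimension stays replicated
#         auto.shard_tensor(weight, mesh, ["mp", None])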
import sys import copy import paddle from paddle import LazyGuard from ppfleetx.core.module.basic_module import BasicModule from ppfleetx.utils.log import logger from .auto_model import ( ErnieModelAuto, ErnieForPretrainingAuto, ErniePretrainingCriterionAuto, ErnieForSequenceClassificationAuto, ) from ppfleetx.models.language_model.auto_utils import process_configs, process_mesh_config import numpy as np def process_data_configs(config): """ process data configs for hybrid parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] cfg_data[mode]['dataset'].setdefault('binary_head', cfg_global['binary_head']) cfg_data[mode]['collate_fn'].setdefault( 'micro_batch_size', cfg_global['micro_batch_size']) def process_model_configs(config): mesh = process_mesh_config(config['Distributed']) cfg_model = config['Model'] hidden_size = cfg_model['hidden_size'] cfg_model.update({'mesh': mesh}) cfg_model.setdefault("intermediate_size", hidden_size * 4) class ErnieModuleAuto(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieModuleAuto, self).__init__(configs) self.nranks = paddle.distributed.get_world_size() self.binary_head = self.configs['Global']['binary_head'] self.loss_fn = ErniePretrainingCriterionAuto(self.binary_head) def process_configs(self, configs): process_data_configs(configs) process_model_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") with LazyGuard(): model = ErnieForPretrainingAuto(ErnieModelAuto(**model_setting)) return model def input_spec(self): inputs_spec = [ paddle.static.InputSpec( shape=[None, None], name="input_ids", dtype="int64"), paddle.static.InputSpec( shape=[None, None], name="token_type_ids", dtype="int64"), paddle.static.InputSpec( shape=[None, None], name="position_ids", dtype="int64"), ] return inputs_spec class ErnieSeqClsModuleAuto(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieSeqClsModuleAuto, self).__init__(configs) def process_configs(self, configs): process_model_configs(configs) cfg_global = configs['Global'] cfg_data = configs['Data'] for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['collate_fn'].setdefault( 'tokenizer_type', cfg_data[mode]['dataset']['tokenizer_type']) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") with LazyGuard(): model = ErnieForSequenceClassificationAuto( ErnieModelAuto(**model_setting)) return model def input_spec(self): input_spec = [ paddle.static.InputSpec( shape=[None, None], dtype="int64", name='input_ids'), paddle.static.InputSpec( shape=[None, None], dtype="int64", name='token_type_ids') ] return input_spec ================================================ FILE: 
ppfleetx/models/language_model/ernie/auto/auto_transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # TODO: define the classes of Transformer neural network import copy import collections import numpy as np import paddle import paddle.nn.functional as F import paddle.nn as nn import paddle.distributed.auto_parallel as auto from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer import paddle.tensor as tensor from paddle.fluid import layers from paddle import ParamAttr from paddle.fluid.data_feeder import convert_dtype from ..layers.model_outputs import BaseModelOutputWithPastAndCrossAttentions __all__ = [] def _convert_param_attr_to_list(param_attr, n): """ If `param_attr` is a list or tuple, convert every element in it to a ParamAttr instance. Otherwise, repeat `param_attr` `n` times to construct a list, and rename every one by appending a increasing index suffix to avoid having same names when `param_attr` contains a name. Parameters: param_attr (list|tuple|ParamAttr): A list, tuple or something can be converted to a ParamAttr instance by `ParamAttr._to_attr`. n (int): The times to repeat to construct a list when `param_attr` is not a list or tuple. Returns: list: A list composed of each including cell's `param_attr`. """ if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) param_attrs = [] for attr in param_attr: if isinstance(attr, bool): if attr: param_attrs.append(ParamAttr._to_attr(None)) else: param_attrs.append(False) else: param_attrs.append(ParamAttr._to_attr(attr)) # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] elif isinstance(param_attr, bool): param_attrs = [] if param_attr: param_attrs = [ParamAttr._to_attr(None) for i in range(n)] else: param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) for i in range(n): attr_i = copy.deepcopy(attr) if attr.name: attr_i.name = attr_i.name + "_" + str(i) param_attrs.append(attr_i) return param_attrs def _convert_attention_mask(attn_mask, dtype): """ Convert the attention mask to the target dtype we expect. Parameters: attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. 
Returns: Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. """ if attn_mask is not None and attn_mask.dtype != dtype: attn_mask_dtype = convert_dtype(attn_mask.dtype) if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 else: attn_mask = paddle.cast(attn_mask, dtype) return attn_mask class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. Please refer to `Attention Is All You Need `_ for more details. Parameters: embed_dim (int): The expected feature size in the input and output. num_heads (int): The number of heads in multi-head attention. dropout (float, optional): The dropout probability used on attention weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Examples: .. code-block:: python import paddle # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, mesh=None, mesh_idx=None): super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.mesh = mesh self.mesh_idx = mesh_idx self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.q_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. 
If `cache` is not None, using cached results to reduce redundant calculations. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ and `[batch_size, n_head, sequence_length, d_value]` separately, \ and their data types are same as inputs. """ auto.shard_tensor(self.q_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. 
""" auto.shard_tensor(self.k_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.v_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used for decoder-encoder cross attention. The cache is generated as follows: 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first dimension of `key`. 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def forward(self, query, key=None, value=None, attn_mask=None, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. 
key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ If `cache` is not None, the tuple then includes the new cache \ having the same type as `cache`, and if it is `StaticCache`, it \ is same as the input `cache`, if it is `Cache`, the new cache \ reserves tensors concatanating raw tensors with intermediate \ results of current query. 
""" key = query if key is None else key value = query if value is None else value # compute q ,k ,v if cache is None: q, k, v = self._prepare_qkv(query, key, value, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = paddle.matmul( x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: # with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) auto.shard_tensor(self.out_proj.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if cache is not None: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If `normalize_before` is True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Parameters: d_model (int): The expected feature size in the input and output. nhead (int): The number of heads in multi-head attention(MHA). dim_feedforward (int): The hidden layer size in the feedforward network(FFN). dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward network. Default relu. attn_dropout (float, optional): The dropout probability used in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. The `False` value means the corresponding layer would not have trainable bias parameter. 
See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False, weight_attr=None, bias_attr=None, mesh=None, mesh_idx=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.mesh = mesh self.mesh_idx = mesh_idx weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], mesh=mesh, mesh_idx=mesh_idx) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout, mode="upscale_in_train") self.dropout2 = Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, src, src_mask=None, cache=None, output_attentions=False): r""" Applies a Transformer encoder layer on the input. Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. 
Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ self.self_attn.need_weights = output_attentions src_mask = _convert_attention_mask(src_mask, src.dtype) auto.shard_tensor(self.linear1.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.linear2.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) residual = src if self.normalize_before: src = self.norm1(src) attn_outputs = self.self_attn(src, src, src, src_mask, cache) if isinstance(attn_outputs, tuple): src = attn_outputs[0] outputs = attn_outputs[1:] else: src = attn_outputs outputs = None src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src if outputs is None else ( (src, ) + outputs[::-1]) # hidden_states, cache, attentions def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( src, type=self.self_attn.Cache) return incremental_cache class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It would be used as the first layer, and the other layers would be created according to the configurations of it. num_layers (int): The number of encoder layers to be stacked. norm (LayerNorm, optional): the layer normalization component. If provided, apply layer normalization on the output of last encoder layer. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None, enable_recompute=False, mesh=None): super(TransformerEncoder, self).__init__() self.stages = mesh.stages(num_layers) self.layers = nn.LayerList() for i in range(num_layers): if i == 0: self.layers.append(encoder_layer) else: encoder_layer._config.update({ "mesh": mesh, "mesh_idx": self.stages[i] }) self.layers.append( type(encoder_layer)(**encoder_layer._config)) self.num_layers = num_layers self.norm = norm self.enable_recompute = enable_recompute def forward(self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): r""" Applies a stack of N Transformer encoder layers on inputs. 
If `norm` is provided, also applies layer normalization on the output of last encoder layer. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `src`, representing the output of Transformer encoder. \ Or a tuple if `cache` is not None, except for encoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ src_mask = _convert_attention_mask(src_mask, src.dtype) output = src # To get cache from None when use_cache is True, which is compatible with HF # while HF requires decoder. The implementation here uses cache update in the # MultiHeadAttention not so efficiently, and maybe optimize it later. if cache is None and getattr(self, "_use_cache", False): cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts # to True when cache is not None. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None all_attentions = [] if output_attentions else None # NOTE: Also includes embeding output which is same as HF. 
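        # In the loop below, each layer's input activation is re-annotated with
        # auto.shard_tensor so that its batch dimension stays mapped to the
        # data-parallel ("dp") axis of that layer's process mesh.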
all_hidden_states = [output] if output_hidden_states else None for i, mod in enumerate(self.layers): auto.shard_tensor( output, mod.mesh[mod.mesh_idx], [mod.mesh.dp] + [None for i in range(len(output.shape) - 1)]) if self.enable_recompute: layer_outputs = auto.recompute(mod)( output, src_mask, None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions) else: layer_outputs = mod( output, src_mask=src_mask, cache=None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions=output_attentions) if isinstance(layer_outputs, tuple): output = layer_outputs[0] outputs = layer_outputs[1:] else: output = layer_outputs outputs = None if output_hidden_states: all_hidden_states.append(output) if output_attentions: all_attentions.append(outputs[-1]) if new_caches is not None: new_caches.append(outputs[0] if isinstance(cache[ i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) if self.norm is not None: output = self.norm(output) if output_hidden_states: all_hidden_states[-1] = output if not return_dict: return output return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=output, past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_attentions) def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: list: It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] return cache ================================================ FILE: ppfleetx/models/language_model/ernie/dygraph/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/ernie/dygraph/hybrid_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import os import io import copy import logging import json import paddle import paddle.nn as nn from paddle.nn import functional as F from dataclasses import dataclass, field from ..layers.model_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ErnieForPreTrainingOutput, SequenceClassifierOutput, ) from ..layers.distributed_transformer import TransformerEncoderLayer, TransformerEncoder from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc from ppfleetx.distributed.apis import env def parallel_matmul(lm_output, logit_weights, parallel_output): """ """ hcg = env.get_hcg() model_parallel_group = hcg.get_model_parallel_group() world_size = hcg.get_model_parallel_world_size() rank = hcg.get_model_parallel_rank() if world_size > 1: input_parallel = paddle.distributed.collective._c_identity( lm_output, group=model_parallel_group) logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True) if parallel_output: return logits return paddle.distributed.collective._c_concat( logits, group=model_parallel_group) else: logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) return logits class ErnieEmbeddings(nn.Layer): r""" Include embeddings from word, position and token_type embeddings. """ def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, weight_attr=None, task_type_vocab_size=3, task_id=0, use_task_id=False): super(ErnieEmbeddings, self).__init__() # self.word_embeddings = nn.Embedding( # vocab_size, # hidden_size, # padding_idx=pad_token_id, # weight_attr=weight_attr) self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( vocab_size, hidden_size, weight_attr=weight_attr) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=weight_attr) self.type_vocab_size = type_vocab_size if self.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding( type_vocab_size, hidden_size, weight_attr=weight_attr) self.use_task_id = use_task_id self.task_id = task_id if self.use_task_id: self.task_type_embeddings = nn.Embedding( task_type_vocab_size, hidden_size, weight_attr=weight_attr) self.layer_norm = nn.LayerNorm(hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None, task_type_ids=None, inputs_embeds=None, past_key_values_length=None): if input_ids is not None: input_shape = paddle.shape(input_ids) input_embeddings = self.word_embeddings(input_ids) else: input_shape = paddle.shape(inputs_embeds)[:-1] input_embeddings = inputs_embeds if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph #seq_length = input_ids.shape[1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) position_ids = seq_length - ones if past_key_values_length is not None: position_ids += past_key_values_length position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) embeddings = input_embeddings + position_embeddings if self.type_vocab_size > 0: if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = embeddings + 
token_type_embeddings if self.use_task_id: if task_type_ids is None: task_type_ids = paddle.ones( input_shape, dtype="int64") * self.task_id task_type_embeddings = self.task_type_embeddings(task_type_ids) embeddings = embeddings + task_type_embeddings embeddings = self.layer_norm(embeddings) embeddings = self.dropout(embeddings) return embeddings class ErniePooler(nn.Layer): def __init__(self, hidden_size, weight_attr=None): super(ErniePooler, self).__init__() self.dense = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class ErnieModelHybrid(nn.Layer): r""" The bare ERNIE Model transformer outputting raw hidden-states. This model is a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: vocab_size (int): Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. hidden_size (int, optional): Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. num_hidden_layers (int, optional): Number of hidden layers in the Transformer encoder. Defaults to `12`. num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. Defaults to `12`. intermediate_size (int, optional): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. Defaults to `3072`. hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. Defaults to `"gelu"`. hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. Defaults to `0.1`. attention_probs_dropout_prob (float, optional): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. Defaults to `0.1`. max_position_embeddings (int, optional): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. Defaults to `512`. type_vocab_size (int, optional): The vocabulary size of the `token_type_ids`. Defaults to `2`. initializer_range (float, optional): The standard deviation of the normal initializer for initializing all weight matrices. Defaults to `0.02`. .. note:: A normal_initializer initializes weight matrices as normal distributions. See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. pad_token_id(int, optional): The index of padding token in the token vocabulary. Defaults to `0`. 
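# A minimal sketch, assuming toy tensor shapes, of how ErnieEmbeddings.forward
# above derives default position_ids when the caller does not pass any:
# a cumulative sum over ones, shifted back by one.
import paddle

toy_shape = [2, 5]                               # [batch_size, sequence_length]
ones = paddle.ones(toy_shape, dtype="int64")
seq_length = paddle.cumsum(ones, axis=1)         # 1, 2, 3, ...
position_ids = seq_length - ones                 # 0, 1, 2, ...
print(position_ids.numpy())                      # [[0 1 2 3 4] [0 1 2 3 4]]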
""" def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False, num_partitions=1): super(ErnieModelHybrid, self).__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.initializer_range)) self.embeddings = ErnieEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, pad_token_id, weight_attr, task_type_vocab_size, task_id, use_task_id) encoder_layer = TransformerEncoderLayer( hidden_size, num_attention_heads, intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=0, weight_attr=weight_attr, normalize_before=False, num_partitions=num_partitions) self.encoder = TransformerEncoder( encoder_layer, num_hidden_layers, enable_recompute=use_recompute) self.pooler = ErniePooler(hidden_size, weight_attr) self.apply(self.init_weights) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, task_type_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. token_type_ids (Tensor, optional): Segment token indices to indicate different portions of the inputs. Selected in the range ``[0, type_vocab_size - 1]``. If `type_vocab_size` is 2, which means the inputs have two portions. Indices can either be 0 or 1: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. Defaults to `None`, which means we don't add segment embeddings. position_ids (Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, max_position_embeddings - 1]``. Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. attention_mask (Tensor, optional): Mask used in multi-head attention to avoid performing attention on to some unwanted positions, usually the paddings or the subsequent positions. Its data type can be int, float and bool. When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. When the data type is int, the `masked` tokens have `0` values and the others have `1` values. When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. 
For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], [batch_size, num_attention_heads, sequence_length, sequence_length]. We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, "使" and "用" will have the same value. Defaults to `None`, which means nothing needed to be prevented attention to. inputs_embeds (Tensor, optional): If you want to control how to convert `inputs_ids` indices into associated vectors, you can pass an embedded representation directly instead of passing `inputs_ids`. past_key_values (tuple(tuple(Tensor)), optional): The length of tuple equals to the number of layers, and each inner tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) which contains precomputed key and value hidden states of the attention blocks. If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are returned. Defaults to `None`. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. """ if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time." 
) elif input_ids is not None: input_shape = paddle.shape(input_ids) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") past_key_values_length = None if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length) self.encoder._use_cache = use_cache # To be consistent with HF encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, cache=past_key_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = encoder_outputs pooled_output = self.pooler(sequence_output) return (sequence_output, pooled_output) else: sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErnieLMPredictionHead(nn.Layer): r""" Ernie Model with a `language modeling` head on top. 
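# A minimal sketch, with made-up sizes, of the vocab projection performed in
# ErnieLMPredictionHead.forward below: hidden states are multiplied against the
# transposed decoder weight, which is also what parallel_matmul near the top of
# this file reduces to when the model-parallel world size is 1.
import paddle

hidden_size, vocab_size = 8, 16
hidden_states = paddle.randn([2, 4, hidden_size])
decoder_weight = paddle.randn([vocab_size, hidden_size])
decoder_bias = paddle.zeros([vocab_size])
logits = paddle.matmul(hidden_states, decoder_weight, transpose_y=True) + decoder_bias
print(logits.shape)                              # [2, 4, 16]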
""" def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErnieLMPredictionHead, self).__init__() self.transform = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = getattr(nn.functional, activation) self.layer_norm = nn.LayerNorm(hidden_size) # TODO(shenliang03): to support shared weights in future self.decoder_weight = self.create_parameter( shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, attr=weight_attr, is_bias=False) # if embedding_weights is None else embedding_weights self.decoder_bias = self.create_parameter( shape=[self.decoder_weight.shape[0]], dtype=self.decoder_weight.dtype, is_bias=True) def forward(self, hidden_states, masked_positions=None): if masked_positions is not None: hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) hidden_states = paddle.tensor.gather(hidden_states, masked_positions) # gather masked tokens might be more quick hidden_states = self.transform(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.layer_norm(hidden_states) # hidden_states = parallel_matmul(hidden_states, self.decoder_weight, True) + self.decoder_bias hidden_states = paddle.matmul( hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias return hidden_states class ErniePretrainingHeads(nn.Layer): def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErniePretrainingHeads, self).__init__() self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights, weight_attr) self.seq_relationship = nn.Linear( hidden_size, 2, weight_attr=weight_attr) def forward(self, sequence_output, pooled_output, masked_positions=None): prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class ErnieForPretrainingHybrid(nn.Layer): r""" Ernie Model with a `masked language modeling` head and a `sentence order prediction` head on top. """ def __init__(self, ernie): super(ErnieForPretrainingHybrid, self).__init__() self.ernie = ernie weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.ernie.initializer_range)) self.cls = ErniePretrainingHeads( self.ernie.hidden_size, self.ernie.vocab_size, self.ernie.hidden_act, embedding_weights=self.ernie.embeddings.word_embeddings.weight, weight_attr=weight_attr, ) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, masked_positions=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids (Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. inputs_embeds(Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size, sequence_length)`, optional): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. 
next_sentence_label (Tensor of shape `(batch_size,)`, optional): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`. """ # with paddle.static.amp.fp16_guard(): outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output, masked_positions) total_loss = None if labels is not None and next_sentence_label is not None: if env.get_hcg().get_model_parallel_world_size > 1 and paddle.is_compiled_with_cuda(): loss_fct = fleet.meta_parallel.ParallelCrossEntropy() else: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.reshape( (-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1, ))) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1, ))) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ( (total_loss, ) + output) if total_loss is not None else output return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErniePretrainingCriterionHybrid(paddle.nn.Layer): r""" The loss output of Ernie Model during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """ def __init__(self, with_nsp_loss=True): super(ErniePretrainingCriterionHybrid, self).__init__() self.with_nsp_loss = with_nsp_loss def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): """ Args: prediction_scores(Tensor): The scores of masked token prediction. Its data type should be float32. 
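# A minimal sketch, with toy values, of the masked-LM part of
# ErniePretrainingCriterionHybrid.forward below: label positions set to -1 are
# ignored by F.cross_entropy (they contribute zero loss), and the unreduced
# losses are then averaged, mirroring the reduction='none' + paddle.mean pattern.
import paddle
import paddle.nn.functional as F

prediction_scores = paddle.randn([6, 8])              # [mask_token_num, vocab_size]
masked_lm_labels = paddle.to_tensor([3, 1, -1, 5, -1, 2])
masked_lm_loss = F.cross_entropy(
    prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none')
print(paddle.mean(masked_lm_loss))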
If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. Otherwise, its shape is [batch_size, mask_token_num, vocab_size] seq_relationship_score(Tensor): The scores of next sentence prediction. Its data type should be float32 and its shape is [batch_size, 2] masked_lm_labels(Tensor): The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. Otherwise, its shape is [batch_size, mask_token_num, 1] next_sentence_labels(Tensor): The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1] Returns: Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. Its data type should be float32 and its shape is [1]. """ # with paddle.static.amp.fp16_guard(): # hcg = env.get_hcg() # mp_size = hcg.get_model_parallel_world_size() # if mp_size > 1: # mask = (masked_lm_labels == -1) # masked_lm_labels[mask] = 0 # masked_lm_loss = self.parallel_loss_func( # prediction_scores, masked_lm_labels) # masked_lm_loss[mask] = 0. # else: # masked_lm_loss = self.loss_func(prediction_scores, # masked_lm_labels, # ignore_index=-1) masked_lm_loss = F.cross_entropy( prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none') if not self.with_nsp_loss: return paddle.mean(masked_lm_loss) next_sentence_loss = F.cross_entropy( seq_relationship_score, next_sentence_labels, reduction='none') return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss) # these Layers is just for PipelineParallel class EmbeddingsPipe(ErnieEmbeddings): @property def embedding_weight(self): return self.word_embeddings.weight def forward(self, tensors): input_ids, token_type_ids, attention_mask = tensors past_key_values_length = None if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embeddings = super().forward( input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, task_type_ids=None, inputs_embeds=None, past_key_values_length=past_key_values_length) return attention_mask, embeddings class TransformerEncoderLayerPipe(TransformerEncoderLayer): def forward(self, tensors): attention_mask, inputs = tensors outputs = super().forward(src=inputs, src_mask=attention_mask) return attention_mask, outputs class LayerNormPipe(nn.LayerNorm): def forward(self, tensors): _, inputs = tensors output = super().forward(inputs) return output class ErniePoolerPipe(ErniePooler): def forward(self, args): sequence_output = args pooled_output = super().forward(sequence_output) return sequence_output, pooled_output class ErniePretrainingCriterionPipe(ErniePretrainingCriterionHybrid): def __init__(self, *heads_args, **heads_kargs): super(ErniePretrainingCriterionPipe, self).__init__() self.heads = 
ErniePretrainingHeads(*heads_args, **heads_kargs) def forward(self, outputs, data): sequence_output, pooled_output = outputs masked_lm_positions, masked_lm_labels, next_sentence_labels = data prediction_scores, seq_relationship_score = self.heads( sequence_output, pooled_output, masked_lm_positions) lm_loss, sop_loss = super().forward( prediction_scores=prediction_scores, seq_relationship_score=seq_relationship_score, masked_lm_labels=masked_lm_labels, next_sentence_labels=next_sentence_labels) return lm_loss + sop_loss class ErnieForPretrainingPipe(PipelineLayer): def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False, num_partitions=1): self.descs = [] self.descs.append( LayerDesc( EmbeddingsPipe, vocab_size=vocab_size, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, max_position_embeddings=max_position_embeddings, type_vocab_size=type_vocab_size, pad_token_id=pad_token_id, weight_attr=None, task_type_vocab_size=task_type_vocab_size, task_id=task_id, use_task_id=use_task_id)) for _ in range(num_hidden_layers): self.descs.append( LayerDesc( TransformerEncoderLayerPipe, d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, normalize_before=False, weight_attr=None, bias_attr=None, num_partitions=num_partitions)) self.descs.append( LayerDesc( LayerNormPipe, normalized_shape=hidden_size)) self.descs.append(LayerDesc(ErniePoolerPipe, hidden_size=hidden_size)) loss_fun = ErniePretrainingCriterionPipe( hidden_size=hidden_size, vocab_size=vocab_size, activation=hidden_act, embedding_weights=None, weight_attr=paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=initializer_range))) super().__init__( layers=self.descs, loss_fn=loss_fun, topology=env.get_hcg().topology(), seg_method="layer:TransformerEncoderLayer", recompute_interval=1 if use_recompute else 0, recompute_ctx={ "mp_group": env.get_hcg().get_model_parallel_group(), "offload": False, "partition": False }) class ErnieForSequenceClassificationHybrid(nn.Layer): """ Ernie Model with a linear layer on top of the output layer, designed for sequence classification/regression tasks like GLUE tasks. Args: ernie (:class:`ErnieModel`): An instance of ErnieModel. num_classes (int, optional): The number of classes. Defaults to `2`. dropout (float, optional): The dropout probability for output of ERNIE. If None, use the same value as `hidden_dropout_prob` of `ErnieModel` instance `ernie`. Defaults to None. """ def __init__(self, ernie, num_classes=2, dropout=None): super(ErnieForSequenceClassificationHybrid, self).__init__() self.num_classes = num_classes self.ernie = ernie # allow ernie to be config self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.hidden_dropout_prob) self.classifier = nn.Linear(self.ernie.hidden_size, num_classes) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" The ErnieForSequenceClassification forward method, overrides the __call__() special method. 
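# A minimal sketch, with assumed toy shapes, of the label-dependent loss
# selection used by the forward method below: MSE for a single regression
# target, cross-entropy for integer class labels, and BCEWithLogits otherwise.
import paddle

num_classes = 3
logits = paddle.randn([4, num_classes])
labels = paddle.randint(0, num_classes, [4])           # int64 -> cross-entropy branch

if num_classes == 1:
    loss = paddle.nn.MSELoss()(logits, labels)
elif labels.dtype in (paddle.int64, paddle.int32):
    loss = paddle.nn.CrossEntropyLoss()(
        logits.reshape((-1, num_classes)), labels.reshape((-1, )))
else:
    loss = paddle.nn.BCEWithLogitsLoss()(logits, labels)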
Args: input_ids (Tensor): See :class:`ErnieModelHybrid`. token_type_ids (Tensor, optional): See :class:`ErnieModelHybrid`. position_ids(Tensor, optional): See :class:`ErnieModelHybrid`. attention_mask (Tensor, optional): See :class:`ErnieModelHybrid`. labels (Tensor of shape `(batch_size,)`, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` a regression loss is computed (Mean-Square loss), If `num_classes > 1` a classification loss is computed (Cross-Entropy). output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`. """ outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_classes == 1: loss_fct = paddle.nn.MSELoss() loss = loss_fct(logits, labels) elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: loss_fct = paddle.nn.CrossEntropyLoss() loss = loss_fct( logits.reshape((-1, self.num_classes)), labels.reshape((-1, ))) else: loss_fct = paddle.nn.BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else ( output[0] if len(output) == 1 else output) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 ================================================ FILE: ppfleetx/models/language_model/ernie/dygraph/single_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import os import io import copy import logging import json import paddle import paddle.nn as nn from paddle.nn import functional as F from dataclasses import dataclass, field from ..layers.model_outputs import ( BaseModelOutputWithPoolingAndCrossAttentions, ModelOutput, ErnieForPreTrainingOutput, SequenceClassifierOutput, ) from ..layers.transformer import TransformerEncoderLayer, TransformerEncoder class ErnieEmbeddings(nn.Layer): r""" Include embeddings from word, position and token_type embeddings. """ def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, pad_token_id=0, weight_attr=None, task_type_vocab_size=3, task_id=0, use_task_id=False): super(ErnieEmbeddings, self).__init__() self.word_embeddings = nn.Embedding( vocab_size, hidden_size, padding_idx=pad_token_id, weight_attr=weight_attr) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=weight_attr) self.type_vocab_size = type_vocab_size if self.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding( type_vocab_size, hidden_size, weight_attr=weight_attr) self.use_task_id = use_task_id self.task_id = task_id if self.use_task_id: self.task_type_embeddings = nn.Embedding( task_type_vocab_size, hidden_size, weight_attr=weight_attr) self.layer_norm = nn.LayerNorm(hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None, task_type_ids=None, inputs_embeds=None, past_key_values_length=None): if input_ids is not None: input_shape = paddle.shape(input_ids) input_embeddings = self.word_embeddings(input_ids) else: input_shape = paddle.shape(inputs_embeds)[:-1] input_embeddings = inputs_embeds if position_ids is None: # maybe need use shape op to unify static graph and dynamic graph #seq_length = input_ids.shape[1] ones = paddle.ones(input_shape, dtype="int64") seq_length = paddle.cumsum(ones, axis=1) position_ids = seq_length - ones if past_key_values_length is not None: position_ids += past_key_values_length position_ids.stop_gradient = True position_embeddings = self.position_embeddings(position_ids) embeddings = input_embeddings + position_embeddings if self.type_vocab_size > 0: if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype="int64") token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = embeddings + token_type_embeddings if self.use_task_id: if task_type_ids is None: task_type_ids = paddle.ones( input_shape, dtype="int64") * self.task_id task_type_embeddings = self.task_type_embeddings(task_type_ids) embeddings = embeddings + task_type_embeddings embeddings = self.layer_norm(embeddings) embeddings = self.dropout(embeddings) return embeddings class ErniePooler(nn.Layer): def __init__(self, hidden_size, weight_attr=None): super(ErniePooler, self).__init__() self.dense = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = nn.Tanh() def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class ErnieModel(nn.Layer): r""" The bare ERNIE Model transformer outputting raw hidden-states. This model is a Paddle `paddle.nn.Layer `__ subclass. 
Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: vocab_size (int): Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. hidden_size (int, optional): Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. num_hidden_layers (int, optional): Number of hidden layers in the Transformer encoder. Defaults to `12`. num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. Defaults to `12`. intermediate_size (int, optional): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. Defaults to `3072`. hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. Defaults to `"gelu"`. hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. Defaults to `0.1`. attention_probs_dropout_prob (float, optional): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. Defaults to `0.1`. max_position_embeddings (int, optional): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. Defaults to `512`. type_vocab_size (int, optional): The vocabulary size of the `token_type_ids`. Defaults to `2`. initializer_range (float, optional): The standard deviation of the normal initializer for initializing all weight matrices. Defaults to `0.02`. .. note:: A normal_initializer initializes weight matrices as normal distributions. See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. pad_token_id(int, optional): The index of padding token in the token vocabulary. Defaults to `0`. 
""" def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, pad_token_id=0, task_type_vocab_size=3, task_id=0, use_task_id=False, use_recompute=False): super(ErnieModel, self).__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.initializer_range)) self.embeddings = ErnieEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, pad_token_id, weight_attr, task_type_vocab_size, task_id, use_task_id) encoder_layer = TransformerEncoderLayer( hidden_size, num_attention_heads, intermediate_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=0, weight_attr=weight_attr, normalize_before=False) self.encoder = TransformerEncoder( encoder_layer, num_hidden_layers, enable_recompute=use_recompute) self.pooler = ErniePooler(hidden_size, weight_attr) self.apply(self.init_weights) def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, task_type_ids=None, past_key_values=None, inputs_embeds=None, use_cache=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): Indices of input sequence tokens in the vocabulary. They are numerical representations of tokens that build the input sequence. It's data type should be `int64` and has a shape of [batch_size, sequence_length]. token_type_ids (Tensor, optional): Segment token indices to indicate different portions of the inputs. Selected in the range ``[0, type_vocab_size - 1]``. If `type_vocab_size` is 2, which means the inputs have two portions. Indices can either be 0 or 1: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. Defaults to `None`, which means we don't add segment embeddings. position_ids (Tensor, optional): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, max_position_embeddings - 1]``. Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. attention_mask (Tensor, optional): Mask used in multi-head attention to avoid performing attention on to some unwanted positions, usually the paddings or the subsequent positions. Its data type can be int, float and bool. When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. When the data type is int, the `masked` tokens have `0` values and the others have `1` values. When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], [batch_size, num_attention_heads, sequence_length, sequence_length]. 
We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, "使" and "用" will have the same value. Defaults to `None`, which means nothing needed to be prevented attention to. inputs_embeds (Tensor, optional): If you want to control how to convert `inputs_ids` indices into associated vectors, you can pass an embedded representation directly instead of passing `inputs_ids`. past_key_values (tuple(tuple(Tensor)), optional): The length of tuple equals to the number of layers, and each inner tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) which contains precomputed key and value hidden states of the attention blocks. If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`. use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are returned. Defaults to `None`. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ModelOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. """ if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time." 
) elif input_ids is not None: input_shape = paddle.shape(input_ids) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: raise ValueError( "You have to specify either input_ids or inputs_embeds") past_key_values_length = None if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id ).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]) if past_key_values is not None: batch_size = past_key_values[0][0].shape[0] past_mask = paddle.zeros( [batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) attention_mask = paddle.concat( [past_mask, attention_mask], axis=-1) # For 2D attention_mask from tokenizer elif attention_mask.ndim == 2: attention_mask = paddle.unsqueeze( attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 attention_mask.stop_gradient = True embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, task_type_ids=task_type_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length) self.encoder._use_cache = use_cache # To be consistent with HF encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, cache=past_key_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = encoder_outputs pooled_output = self.pooler(sequence_output) return (sequence_output, pooled_output) else: sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErnieLMPredictionHead(nn.Layer): r""" Ernie Model with a `language modeling` head on top. 
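# A minimal sketch, with made-up indices, of the masked-position gather used by
# ErnieLMPredictionHead.forward below: hidden states are flattened to
# [batch * seq, hidden] and the rows at the flattened masked positions are
# selected before the vocab projection.
import paddle

batch_size, seq_len, hidden_size = 2, 4, 8
hidden_states = paddle.randn([batch_size, seq_len, hidden_size])
masked_positions = paddle.to_tensor([1, 3, 6])        # indices into batch * seq
flat = paddle.reshape(hidden_states, [-1, hidden_size])
masked_states = paddle.gather(flat, masked_positions)
print(masked_states.shape)                            # [3, 8]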
""" def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErnieLMPredictionHead, self).__init__() self.transform = nn.Linear( hidden_size, hidden_size, weight_attr=weight_attr) self.activation = getattr(nn.functional, activation) self.layer_norm = nn.LayerNorm(hidden_size) self.decoder_weight = self.create_parameter( shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, attr=weight_attr, is_bias=False) if embedding_weights is None else embedding_weights self.decoder_bias = self.create_parameter( shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) def forward(self, hidden_states, masked_positions=None): if masked_positions is not None: hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) hidden_states = paddle.tensor.gather(hidden_states, masked_positions) # gather masked tokens might be more quick hidden_states = self.transform(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.layer_norm(hidden_states) hidden_states = paddle.matmul( hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias return hidden_states class ErniePretrainingHeads(nn.Layer): def __init__( self, hidden_size, vocab_size, activation, embedding_weights=None, weight_attr=None, ): super(ErniePretrainingHeads, self).__init__() self.predictions = ErnieLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights, weight_attr) self.seq_relationship = nn.Linear( hidden_size, 2, weight_attr=weight_attr) def forward(self, sequence_output, pooled_output, masked_positions=None): prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class ErnieForPretraining(nn.Layer): r""" Ernie Model with a `masked language modeling` head and a `sentence order prediction` head on top. """ def __init__(self, ernie): super(ErnieForPretraining, self).__init__() self.ernie = ernie weight_attr = paddle.ParamAttr( initializer=nn.initializer.TruncatedNormal( mean=0.0, std=self.ernie.initializer_range)) self.cls = ErniePretrainingHeads( self.ernie.hidden_size, self.ernie.vocab_size, self.ernie.hidden_act, embedding_weights=self.ernie.embeddings.word_embeddings.weight, weight_attr=weight_attr, ) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, masked_positions=None, inputs_embeds=None, labels=None, next_sentence_label=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids (Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. inputs_embeds(Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size, sequence_length)`, optional): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. next_sentence_label (Tensor of shape `(batch_size,)`, optional): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - 0 indicates sequence B is a continuation of sequence A, - 1 indicates sequence B is a random sequence. output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.ErnieForPreTrainingOutput`. """ # with paddle.static.amp.fp16_guard(): outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output, masked_positions) total_loss = None if labels is not None and next_sentence_label is not None: loss_fct = paddle.nn.CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.reshape( (-1, paddle.shape(prediction_scores)[-1])), labels.reshape((-1, ))) next_sentence_loss = loss_fct( seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1, ))) total_loss = masked_lm_loss + next_sentence_loss if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ( (total_loss, ) + output) if total_loss is not None else output return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, seq_relationship_logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): # only support dygraph, use truncated_normal and make it inplace # and configurable later if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 class ErniePretrainingCriterion(paddle.nn.Layer): r""" The loss output of Ernie Model during the pretraining: a `masked language modeling` head and a `next sentence prediction (classification)` head. """ def __init__(self, with_nsp_loss=True): super(ErniePretrainingCriterion, self).__init__() self.with_nsp_loss = with_nsp_loss #self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1) def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): """ Args: prediction_scores(Tensor): The scores of masked token prediction. Its data type should be float32. If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. Otherwise, its shape is [batch_size, mask_token_num, vocab_size] seq_relationship_score(Tensor): The scores of next sentence prediction. 
Its data type should be float32 and its shape is [batch_size, 2] masked_lm_labels(Tensor): The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. Otherwise, its shape is [batch_size, mask_token_num, 1] next_sentence_labels(Tensor): The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1] Returns: Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. Its data type should be float32 and its shape is [1]. """ with paddle.static.amp.fp16_guard(): masked_lm_loss = F.cross_entropy( prediction_scores, masked_lm_labels, ignore_index=-1, reduction='none') if not self.with_nsp_loss: return paddle.mean(masked_lm_loss) next_sentence_loss = F.cross_entropy( seq_relationship_score, next_sentence_labels, reduction='none') return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss) class ErnieForSequenceClassification(nn.Layer): """ Ernie Model with a linear layer on top of the output layer, designed for sequence classification/regression tasks like GLUE tasks. Args: ernie (:class:`ErnieModel`): An instance of ErnieModel. num_classes (int, optional): The number of classes. Defaults to `2`. dropout (float, optional): The dropout probability for output of ERNIE. If None, use the same value as `hidden_dropout_prob` of `ErnieModel` instance `ernie`. Defaults to None. """ def __init__(self, ernie, num_classes=2, dropout=None): super(ErnieForSequenceClassification, self).__init__() self.num_classes = num_classes self.ernie = ernie # allow ernie to be config self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.hidden_dropout_prob) self.classifier = nn.Linear(self.ernie.hidden_size, num_classes) self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None, output_hidden_states=False, output_attentions=False, return_dict=False): r""" The ErnieForSequenceClassification forward method, overrides the __call__() special method. Args: input_ids (Tensor): See :class:`ErnieModel`. token_type_ids (Tensor, optional): See :class:`ErnieModel`. position_ids(Tensor, optional): See :class:`ErnieModel`. attention_mask (Tensor, optional): See :class:`ErnieModel`. labels (Tensor of shape `(batch_size,)`, optional): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` a regression loss is computed (Mean-Square loss), If `num_classes > 1` a classification loss is computed (Cross-Entropy). output_hidden_states (bool, optional): Whether to return the hidden states of all layers. Defaults to `False`. output_attentions (bool, optional): Whether to return the attentions tensors of all attention layers. Defaults to `False`. return_dict (bool, optional): Whether to return a :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` object. If `False`, the output will be a tuple of tensors. Defaults to `False`. Returns: An instance of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of :class:`~ppfleetx.models.language_model.ernie.layers.model_outputs.SequenceClassifierOutput`. """ outputs = self.ernie( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_classes == 1: loss_fct = paddle.nn.MSELoss() loss = loss_fct(logits, labels) elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: loss_fct = paddle.nn.CrossEntropyLoss() loss = loss_fct( logits.reshape((-1, self.num_classes)), labels.reshape((-1, ))) else: loss_fct = paddle.nn.BCEWithLogitsLoss() loss = loss_fct(logits, labels) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else ( output[0] if len(output) == 1 else output) return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) def init_weights(self, layer): """ Initialization hook """ if isinstance(layer, (nn.Linear, nn.Embedding)): if isinstance(layer.weight, paddle.Tensor): layer.weight.set_value( paddle.tensor.normal( mean=0.0, std=self.initializer_range if hasattr(self, "initializer_range") else self.ernie.initializer_range, shape=layer.weight.shape)) elif isinstance(layer, nn.LayerNorm): layer._epsilon = 1e-12 ================================================ FILE: ppfleetx/models/language_model/ernie/ernie_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
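# A quick numeric check, with assumed config values, of the parameter-count
# formula used by ErnieModule.get_model_size further down in this file:
#     P = 12 * l * h^2 * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))
l, h, v, s = 12, 768, 40000, 512      # layers, hidden size, vocab size, seq len (assumed)
P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h))
print('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0))   # ~0.12 B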
import sys import copy import yaml import codecs from collections.abc import Mapping import paddle from paddle.static import InputSpec import paddle.nn as nn from ppfleetx.core.module.basic_module import BasicModule import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.log import logger from .dygraph.single_model import ( ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieForSequenceClassification, ) from .dygraph.hybrid_model import (ErnieModelHybrid, ErnieForPretrainingHybrid, ErniePretrainingCriterionHybrid, ErnieForPretrainingPipe, ErnieForSequenceClassificationHybrid) from ppfleetx.models.language_model.utils import process_configs import numpy as np def process_data_configs(config): """ process data configs for hybrid parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] cfg_data[mode]['sampler']['batch_size'] = cfg_global[ 'local_batch_size'] cfg_data[mode]['dataset'].setdefault('binary_head', cfg_global['binary_head']) cfg_data[mode]['loader']['collate_fn'].setdefault( 'micro_batch_size', cfg_global['micro_batch_size']) def process_model_configs(config): cfg_model = config['Model'] hidden_size = cfg_model['hidden_size'] cfg_model.setdefault("intermediate_size", hidden_size * 4) def process_finetune_configs(task, config): cfg_data = config['Data'] cfg_dist = config['Distributed'] cfg_optim = config['Optimizer'] cfg_global = config['Global'] cfg_engine = config['Engine'] path = "./ppfleetx/models/language_model/ernie/finetune_configs.yaml" with codecs.open(path, 'r', 'utf-8') as file: dic = yaml.load(file, Loader=yaml.FullLoader) dataset_type = cfg_data.Train.dataset.dataset_type assert dataset_type in dic[task].keys( ), "{} is an invalid dataset type ! 
Only support the types of dataset shown in {}".format( dataset_type, path) num_train_epochs = dic[task][dataset_type].get('num_train_epochs', None) if num_train_epochs is not None: cfg_engine['num_train_epochs'] = num_train_epochs learning_rate = dic[task][dataset_type].get("learning_rate", None) if learning_rate is not None: cfg_optim['lr']['max_lr'] = learning_rate max_seq_length = dic[task][dataset_type].get("max_seq_length", None) if max_seq_length is not None: for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['max_seq_len'] = max_seq_length batch_size = dic[task][dataset_type].get("batch_size", None) if batch_size is not None: assert batch_size % cfg_global['micro_batch_size'] == 0 cfg_global['local_batch_size'] = batch_size cfg_global['global_batch_size'] = batch_size * cfg_dist[ 'dp_degree'] * cfg_dist['pp_degree'] class ErnieModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieModule, self).__init__(configs) self.nranks = paddle.distributed.get_world_size() self.binary_head = self.configs['Global']['binary_head'] if self.nranks > 1: self.criterion = ErniePretrainingCriterionHybrid(self.binary_head) else: self.criterion = ErniePretrainingCriterion(self.binary_head) def get_model_size(self, l, h, v, s): P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h)) logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def process_configs(self, configs): process_data_configs(configs) process_model_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") l = model_setting['num_hidden_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_length self.get_model_size(l, h, v, s) if self.nranks > 1: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree # model = ErnieForPretrainingHybrid(ErnieModelHybrid(**model_setting)) if self.configs.Distributed.pp_degree == 1: model = ErnieForPretrainingHybrid( ErnieModelHybrid(**model_setting)) else: model = ErnieForPretrainingPipe(**model_setting) else: model = ErnieForPretraining(ErnieModel(**model_setting)) return model def forward(self, tokens): return self.model(tokens) def pretreating_batch(self, batch): if self.configs.Distributed.pp_degree > 1: input_ids, segment_ids, input_mask, masked_lm_positions, \ masked_lm_labels, next_sentence_labels = batch if not isinstance(masked_lm_positions, list): masked_lm_positions = [masked_lm_positions] if not isinstance(masked_lm_labels, list): masked_lm_labels = [masked_lm_labels] data = [ (input_ids, segment_ids, input_mask), (masked_lm_positions, masked_lm_labels, next_sentence_labels) ] return data else: return batch def training_step(self, batch): input_ids, segment_ids, input_mask, masked_lm_positions, \ masked_lm_labels, next_sentence_labels = batch # Create the model for the ernie pretrain if self.binary_head: prediction_scores, seq_relationship_score = self.model( input_ids=input_ids, token_type_ids=segment_ids, # position_ids=None, attention_mask=input_mask, masked_positions=masked_lm_positions) lm_loss, sop_loss = self.criterion( prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) loss = lm_loss + sop_loss else: prediction_scores = self.model( input_ids=input_ids, token_type_ids=segment_ids, # position_ids=None, attention_mask=input_mask, 
masked_positions=masked_lm_positions) loss = self.criterion(prediction_scores, None, masked_lm_labels) return loss def training_step_end(self, log_dict): speed = 1. / log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_length logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.nranks, log_dict['lr'])) def input_spec(self): return [ InputSpec( shape=[None, None], dtype='int64'), InputSpec( shape=[None, None], dtype='int64'), InputSpec( shape=[None, None], dtype='int64') ] class ErnieSeqClsModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(ErnieSeqClsModule, self).__init__(configs) self.criterion = nn.loss.CrossEntropyLoss( ) # if data_args.label_list else nn.loss.MSELoss() self.past_index = -1 self.past = None self.label_names = (["start_positions", "end_positions"] \ if "QusetionAnswering" in type(self.model).__name__ else ["labels"]) def process_configs(self, configs): process_model_configs(configs) process_finetune_configs("SequenceClassification", configs) cfg_global = configs['Global'] cfg_data = configs['Data'] for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['sampler']['batch_size'] = cfg_global[ 'local_batch_size'] cfg_data[mode]['loader']['collate_fn'].setdefault( 'tokenizer_type', cfg_data[mode]['dataset']['tokenizer_type']) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") if self.nranks > 1: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree if self.configs.Distributed.pp_degree == 1: model = ErnieForSequenceClassificationHybrid( ErnieModelHybrid(**model_setting)) else: raise ValueError( "Pipeline Parallelism is not supported in Sequence \ Classification task of Ernie model.") else: model = ErnieForSequenceClassification(ErnieModel(**model_setting)) return model def prepare_input(self, data): """ Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. 
""" if isinstance(data, Mapping): return type(data)( {k: self.prepare_input(v) for k, v in data.items()}) elif isinstance(data, (tuple, list)): return type(data)(self.prepare_input(v) for v in data) elif isinstance(data, paddle.Tensor): # kwargs = dict(device=self.args.current_device) # update data type for pure fp16 return data # return data.to(**kwargs) return data def pretreating_batch(self, batch): self.has_labels = all( batch.get(k) is not None for k in self.label_names) batch = self.prepare_input(batch) if self.past_index >= 0 and self.past is not None: batch["mems"] = self.past return batch def forward(self, inputs): return self.model(**inputs) def compute_loss(self, inputs, return_outputs=False): if "labels" in inputs: labels = inputs.pop("labels") elif "start_positions" in inputs and "end_positions" in inputs: labels = (inputs.pop("start_positions"), inputs.pop("end_positions")) elif "generator_labels" in inputs: labels = inputs["generator_labels"] else: labels = None outputs = self(inputs) loss = self.criterion(outputs, labels) outputs = (loss, outputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. if self.past_index >= 0: self.past = outputs[self.args.past_index] # We don't use .loss here since the model may return tuples instead of ModelOutput. loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] return (loss, outputs) if return_outputs else loss def training_step(self, batch): return self.compute_loss(batch) def training_step_end(self, log_dict): speed = 1. / log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.nranks, log_dict['lr'])) def input_spec(self): input_spec = [ paddle.static.InputSpec( shape=[None, None], dtype="int64"), # input_ids paddle.static.InputSpec( shape=[None, None], dtype="int64") # segment_ids ] return input_spec def validation_step(self, inputs): if self.has_labels: loss, outputs = self.compute_loss(inputs, return_outputs=True) loss = loss.mean().detach() else: loss = None return loss def validation_step_end(self, log_dict): speed = 1. / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost'], speed)) ================================================ FILE: ppfleetx/models/language_model/ernie/finetune_configs.yaml ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# Datasets which are used for sequence classification
SequenceClassification:
  clue afqmc:
    num_train_epochs: 4
  clue tnews:
    num_train_epochs: 4
  clue iflytek:
    num_train_epochs: 8
  clue ocnli:
    num_train_epochs: 8
  clue cmnli:
    num_train_epochs: 3
  clue wsc:
    num_train_epochs: 50
  clue csl:
    num_train_epochs: 10
    max_seq_length: 256
    batch_size: 32
  xnli_cn:
    learning_rate: 0.0001
    num_train_epochs: 3
    batch_size: 256
  chnsenticorp_v2:
    learning_rate: 0.00005
    batch_size: 16
    num_train_epochs: 8


================================================
FILE: ppfleetx/models/language_model/ernie/layers/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: ppfleetx/models/language_model/ernie/layers/distributed_transformer.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TODO: define the classes of Transformer neural network
import copy
import collections

import numpy as np

import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer
import paddle.tensor as tensor
from paddle.fluid import layers
from paddle import ParamAttr
from paddle.fluid.data_feeder import convert_dtype
from .model_outputs import BaseModelOutputWithPastAndCrossAttentions

from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc
from paddle.distributed.fleet.utils import recompute

__all__ = []


def _convert_param_attr_to_list(param_attr, n):
    """
    If `param_attr` is a list or tuple, convert every element in it to a
    ParamAttr instance. Otherwise, repeat `param_attr` `n` times to construct
    a list, and rename every one by appending an increasing index suffix to
    avoid having the same names when `param_attr` contains a name.

    Parameters:
        param_attr (list|tuple|ParamAttr): A list, a tuple, or something that
            can be converted to a ParamAttr instance by `ParamAttr._to_attr`.
        n (int): The number of times to repeat to construct a list when
            `param_attr` is not a list or tuple.

    Returns:
        list: A list composed of each cell's `param_attr`.
""" if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) param_attrs = [] for attr in param_attr: if isinstance(attr, bool): if attr: param_attrs.append(ParamAttr._to_attr(None)) else: param_attrs.append(False) else: param_attrs.append(ParamAttr._to_attr(attr)) # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] elif isinstance(param_attr, bool): param_attrs = [] if param_attr: param_attrs = [ParamAttr._to_attr(None) for i in range(n)] else: param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) for i in range(n): attr_i = copy.deepcopy(attr) if attr.name: attr_i.name = attr_i.name + "_" + str(i) param_attrs.append(attr_i) return param_attrs def _convert_attention_mask(attn_mask, dtype): """ Convert the attention mask to the target dtype we expect. Parameters: attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. Returns: Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. """ if attn_mask is not None and attn_mask.dtype != dtype: attn_mask_dtype = convert_dtype(attn_mask.dtype) if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 else: attn_mask = paddle.cast(attn_mask, dtype) return attn_mask class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. Please refer to `Attention Is All You Need `_ for more details. Parameters: embed_dim (int): The expected feature size in the input and output. num_heads (int): The number of heads in multi-head attention. dropout (float, optional): The dropout probability used on attention weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Examples: .. 
code-block:: python import paddle # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, num_partitions=1): super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.num_heads % num_partitions == 0 self.num_heads = self.num_heads // num_partitions # self.q_proj = Linear( # embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) # self.k_proj = Linear( # self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) # self.v_proj = Linear( # self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) # self.out_proj = Linear( # embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.q_proj = fleet.meta_parallel.ColumnParallelLinear( embed_dim, embed_dim, weight_attr=weight_attr, has_bias=True, gather_output=False) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( self.kdim, embed_dim, weight_attr=weight_attr, has_bias=True, gather_output=False) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( self.vdim, embed_dim, weight_attr=weight_attr, has_bias=True, gather_output=False) self.out_proj = fleet.meta_parallel.RowParallelLinear( embed_dim, embed_dim, weight_attr=weight_attr, has_bias=True, input_is_parallel=True) def _prepare_qkv(self, query, key, value, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ and `[batch_size, n_head, sequence_length, d_value]` separately, \ and their data types are same as inputs. """ q = self.q_proj(query.clone()) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key.clone(), value.clone()) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. """ k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used for decoder-encoder cross attention. The cache is generated as follows: 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. 2. 
If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first dimension of `key`. 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def forward(self, query, key=None, value=None, attn_mask=None, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ If `cache` is not None, the tuple then includes the new cache \ having the same type as `cache`, and if it is `StaticCache`, it \ is same as the input `cache`, if it is `Cache`, the new cache \ reserves tensors concatanating raw tensors with intermediate \ results of current query. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if cache is None: q, k, v = self._prepare_qkv(query, key, value, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = paddle.matmul( x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if cache is not None: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If `normalize_before` is True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Parameters: d_model (int): The expected feature size in the input and output. nhead (int): The number of heads in multi-head attention(MHA). dim_feedforward (int): The hidden layer size in the feedforward network(FFN). dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward network. Default relu. attn_dropout (float, optional): The dropout probability used in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. 
Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. The `False` value means the corresponding layer would not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False, weight_attr=None, bias_attr=None, num_partitions=1): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], num_partitions=num_partitions) # self.linear1 = Linear( # d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout(act_dropout, mode="upscale_in_train") # self.linear2 = Linear( # dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout, mode="upscale_in_train") self.dropout2 = Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) self.linear1 = fleet.meta_parallel.ColumnParallelLinear( d_model, dim_feedforward, weight_attr=weight_attrs[1], gather_output=False, has_bias=True) self.linear2 = fleet.meta_parallel.RowParallelLinear( dim_feedforward, d_model, weight_attr=weight_attrs[1], input_is_parallel=True, has_bias=True) def forward(self, src, src_mask=None, cache=None, output_attentions=False): r""" Applies a Transformer encoder layer on the input. Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. 
src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ self.self_attn.need_weights = output_attentions src_mask = _convert_attention_mask(src_mask, src.dtype) residual = src if self.normalize_before: src = self.norm1(src) attn_outputs = self.self_attn(src, src, src, src_mask, cache) if isinstance(attn_outputs, tuple): src = attn_outputs[0] outputs = attn_outputs[1:] else: src = attn_outputs outputs = None src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) with get_rng_state_tracker().rng_state('global_seed'): tgt = self.dropout(self.activation(self.linear1(src))) # tgt = residual + self.dropout1(tgt) src = self.linear2(tgt) with get_rng_state_tracker().rng_state('global_seed'): src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src if outputs is None else ( (src, ) + outputs[::-1]) # hidden_states, cache, attentions def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( src, type=self.self_attn.Cache) return incremental_cache class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It would be used as the first layer, and the other layers would be created according to the configurations of it. num_layers (int): The number of encoder layers to be stacked. norm (LayerNorm, optional): the layer normalization component. If provided, apply layer normalization on the output of last encoder layer. Examples: .. 
code-block:: python import paddle from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None, enable_recompute=False): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm self.enable_recompute = enable_recompute def forward(self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): r""" Applies a stack of N Transformer encoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last encoder layer. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `src`, representing the output of Transformer encoder. \ Or a tuple if `cache` is not None, except for encoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ src_mask = _convert_attention_mask(src_mask, src.dtype) output = src # To get cache from None when use_cache is True, which is compatible with HF # while HF requires decoder. The implementation here uses cache update in the # MultiHeadAttention not so efficiently, and maybe optimize it later. if cache is None and getattr(self, "_use_cache", False): cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts # to True when cache is not None. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None all_attentions = [] if output_attentions else None # NOTE: Also includes embeding output which is same as HF. all_hidden_states = [output] if output_hidden_states else None for i, mod in enumerate(self.layers): if self.enable_recompute: # Note: recompute do not support pass as **kwargs yet. 
layer_outputs = recompute( mod, output, src_mask, None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions) else: layer_outputs = mod( output, src_mask=src_mask, cache=None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions=output_attentions) if isinstance(layer_outputs, tuple): output = layer_outputs[0] outputs = layer_outputs[1:] else: output = layer_outputs outputs = None if output_hidden_states: all_hidden_states.append(output) if output_attentions: all_attentions.append(outputs[-1]) if new_caches is not None: new_caches.append(outputs[0] if isinstance(cache[ i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) if self.norm is not None: output = self.norm(output) if output_hidden_states: all_hidden_states[-1] = output if not return_dict: outputs = tuple( tuple(v) if isinstance(v, list) else v for v in [ output, new_caches, all_hidden_states, all_attentions, ] if v is not None) if len(outputs) == 1: return output else: return outputs return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=output, past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_attentions) def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: list: It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] return cache ================================================ FILE: ppfleetx/models/language_model/ernie/layers/model_outputs.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import functools import paddle import numpy as np from collections import OrderedDict from dataclasses import fields, dataclass from typing import Any, List, Tuple, Optional from paddle.nn.layer.transformer import _convert_attention_mask, MultiHeadAttention from paddle.distributed.fleet.utils import recompute from .utils import adapt_stale_fwd_patch def is_tensor(x): if isinstance(x, paddle.Tensor): return True return isinstance(x, np.ndarray) class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular python dictionary. 
You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple before. """ def __post_init__(self): class_fields = fields(self) # note(guosheng): Convert list to tuple automatically, and better to # check if it is frozen. # assert not getattr(self, dataclasses._PARAMS).frozen for f in class_fields: value = getattr(self, f.name) if isinstance(value, list): setattr(self, f.name, tuple(value)) # Safety and consistency checks if not len(class_fields): raise ValueError(f"{self.__class__.__name__} has no fields.") if not all(field.default is None for field in class_fields[1:]): raise ValueError( f"{self.__class__.__name__} should not have more than one required field." ) first_field = getattr(self, class_fields[0].name) other_fields_are_none = all( getattr(self, field.name) is None for field in class_fields[1:]) if other_fields_are_none and not is_tensor(first_field): if isinstance(first_field, dict): iterator = first_field.items() first_field_iterator = True else: try: iterator = iter(first_field) first_field_iterator = True except TypeError: first_field_iterator = False # if we provided an iterator as first field and the iterator is a (key, value) iterator # set the associated fields if first_field_iterator: for element in iterator: if (not isinstance(element, (list, tuple)) or not len(element) == 2 or not isinstance(element[0], str)): break setattr(self, element[0], element[1]) if element[1] is not None: self[element[0]] = element[1] elif first_field is not None: self[class_fields[0].name] = first_field else: for field in class_fields: v = getattr(self, field.name) if v is not None: self[field.name] = v def __delitem__(self, *args, **kwargs): raise Exception( f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." ) def setdefault(self, *args, **kwargs): raise Exception( f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." ) def pop(self, *args, **kwargs): raise Exception( f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") def update(self, *args, **kwargs): raise Exception( f"You cannot use ``update`` on a {self.__class__.__name__} instance." ) def __getitem__(self, k): if isinstance(k, str): inner_dict = {k: v for (k, v) in self.items()} return inner_dict[k] else: return self.to_tuple()[k] def __setattr__(self, name, value): if name in self.keys() and value is not None: # Don't call self.__setitem__ to avoid recursion errors super().__setitem__(name, value) super().__setattr__(name, value) def __setitem__(self, key, value): # Will raise a KeyException if needed super().__setitem__(key, value) # Don't call self.__setattr__ to avoid recursion errors super().__setattr__(key, value) def to_tuple(self) -> Tuple[Any]: """ Convert self to a tuple containing all the attributes/keys that are not `None`. """ return tuple(self[k] for k in self.keys()) @dataclass class ErnieForPreTrainingOutput(ModelOutput): """ Output type of [`ErnieForPreTraining`]. Args: loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss = None prediction_logits = None seq_relationship_logits = None hidden_states = None attentions = None @dataclass class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): """ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). Args: last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. 
""" last_hidden_state: paddle.Tensor = None past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): """ Base class for model's outputs that also contains a pooling of the last hidden states. Args: last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns the classification token after processing through a linear layer and a tanh activation function. The linear layer weights are trained from the next sentence prediction (classification) objective during pretraining. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. """ last_hidden_state: paddle.Tensor = None pooler_output: paddle.Tensor = None past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class SequenceClassifierOutput(ModelOutput): """ Base class for outputs of sentence classification models. 
Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Classification (or regression if config.num_labels==1) loss. logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class TokenClassifierOutput(ModelOutput): """ Base class for outputs of token classification models. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : Classification loss. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class QuestionAnsweringModelOutput(ModelOutput): """ Base class for outputs of question answering models. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): Span-start scores (before SoftMax). end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): Span-end scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None start_logits: paddle.Tensor = None end_logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class MultipleChoiceModelOutput(ModelOutput): """ Base class for outputs of multiple choice models. Args: loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided): Classification loss. logits (`paddle.Tensor` of shape `(batch_size, num_choices)`): *num_choices* is the second dimension of the input tensors. (see *input_ids* above). Classification scores (before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class MaskedLMOutput(ModelOutput): """ Base class for masked language models outputs. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Masked language modeling (MLM) loss. logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
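Example (editor's illustrative sketch, not part of the original docstring; the shapes below are toy values):

.. code-block:: python

    import paddle

    # Output objects are dataclasses, so fields are read by name.
    output = MaskedLMOutput(
        loss=paddle.to_tensor([2.31]),
        logits=paddle.rand([2, 8, 50304]))
    print(output.loss.shape)    # [1]
    print(output.logits.shape)  # [2, 8, 50304] -> [batch, seq_len, vocab]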
""" loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None @dataclass class CausalLMOutputWithCrossAttentions(ModelOutput): """ Base class for causal language model (or autoregressive) outputs. Args: loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Cross attentions weights after the attention softmax, used to compute the weighted average in the cross-attention heads. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key, value states of the self-attention and the cross-attention layers if model is used in encoder-decoder setting. Only relevant if `config.is_decoder = True`. Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. """ loss: Optional[paddle.Tensor] = None logits: paddle.Tensor = None past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None hidden_states: Optional[Tuple[paddle.Tensor]] = None attentions: Optional[Tuple[paddle.Tensor]] = None cross_attentions: Optional[Tuple[paddle.Tensor]] = None ================================================ FILE: ppfleetx/models/language_model/ernie/layers/transformer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# TODO: define the classes of Transformer neural network import copy import collections import numpy as np import paddle import paddle.nn.functional as F import paddle.nn as nn from paddle.nn import Linear, Dropout, LayerNorm, LayerList, Layer import paddle.tensor as tensor from paddle.fluid import layers from paddle import ParamAttr from paddle.fluid.data_feeder import convert_dtype from .model_outputs import BaseModelOutputWithPastAndCrossAttentions from paddle.distributed.fleet.utils import recompute __all__ = [] def _convert_param_attr_to_list(param_attr, n): """ If `param_attr` is a list or tuple, convert every element in it to a ParamAttr instance. Otherwise, repeat `param_attr` `n` times to construct a list, and rename every one by appending a increasing index suffix to avoid having same names when `param_attr` contains a name. Parameters: param_attr (list|tuple|ParamAttr): A list, tuple or something can be converted to a ParamAttr instance by `ParamAttr._to_attr`. n (int): The times to repeat to construct a list when `param_attr` is not a list or tuple. Returns: list: A list composed of each including cell's `param_attr`. """ if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) param_attrs = [] for attr in param_attr: if isinstance(attr, bool): if attr: param_attrs.append(ParamAttr._to_attr(None)) else: param_attrs.append(False) else: param_attrs.append(ParamAttr._to_attr(attr)) # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] elif isinstance(param_attr, bool): param_attrs = [] if param_attr: param_attrs = [ParamAttr._to_attr(None) for i in range(n)] else: param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) for i in range(n): attr_i = copy.deepcopy(attr) if attr.name: attr_i.name = attr_i.name + "_" + str(i) param_attrs.append(attr_i) return param_attrs def _convert_attention_mask(attn_mask, dtype): """ Convert the attention mask to the target dtype we expect. Parameters: attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. Returns: Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. """ if attn_mask is not None and attn_mask.dtype != dtype: attn_mask_dtype = convert_dtype(attn_mask.dtype) if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 else: attn_mask = paddle.cast(attn_mask, dtype) return attn_mask class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. Please refer to `Attention Is All You Need `_ for more details. Parameters: embed_dim (int): The expected feature size in the input and output. 
num_heads (int): The number of heads in multi-head attention. dropout (float, optional): The dropout probability used on attention weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Examples: .. code-block:: python import paddle # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None): super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.q_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. 
cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ and `[batch_size, n_head, sequence_length, d_value]` separately, \ and their data types are same as inputs. """ q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. """ k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. 
If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used for decoder-encoder cross attention. The cache is generated as follows: 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first dimension of `key`. 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def forward(self, query, key=None, value=None, attn_mask=None, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. Parameters: query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. 
cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and `v` fields would be used as calculated results on `key` and `value`, which mostly used for decoder-encoder cross attention. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ If `cache` is not None, the tuple then includes the new cache \ having the same type as `cache`, and if it is `StaticCache`, it \ is same as the input `cache`, if it is `Cache`, the new cache \ reserves tensors concatanating raw tensors with intermediate \ results of current query. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if cache is None: q, k, v = self._prepare_qkv(query, key, value, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = paddle.matmul( x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if cache is not None: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If `normalize_before` is True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Parameters: d_model (int): The expected feature size in the input and output. nhead (int): The number of heads in multi-head attention(MHA). dim_feedforward (int): The hidden layer size in the feedforward network(FFN). dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward network. Default relu. attn_dropout (float, optional): The dropout probability used in MHA to drop some attention target. If None, use the value of `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. 
If None, use the value of `dropout`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. The `False` value means the corresponding layer would not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. Examples: .. code-block:: python import paddle from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="relu", attn_dropout=None, act_dropout=None, normalize_before=False, weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout, mode="upscale_in_train") self.dropout2 = Dropout(dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, src, src_mask=None, cache=None, output_attentions=False): r""" Applies a Transformer encoder layer on the input. Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. 
src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ self.self_attn.need_weights = output_attentions src_mask = _convert_attention_mask(src_mask, src.dtype) residual = src if self.normalize_before: src = self.norm1(src) attn_outputs = self.self_attn(src, src, src, src_mask, cache) if isinstance(attn_outputs, tuple): src = attn_outputs[0] outputs = attn_outputs[1:] else: src = attn_outputs outputs = None src = residual + self.dropout1(src) if not self.normalize_before: src = self.norm1(src) residual = src if self.normalize_before: src = self.norm2(src) src = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = residual + self.dropout2(src) if not self.normalize_before: src = self.norm2(src) return src if outputs is None else ( (src, ) + outputs[::-1]) # hidden_states, cache, attentions def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( src, type=self.self_attn.Cache) return incremental_cache class TransformerEncoder(Layer): """ TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It would be used as the first layer, and the other layers would be created according to the configurations of it. num_layers (int): The number of encoder layers to be stacked. norm (LayerNorm, optional): the layer normalization component. If provided, apply layer normalization on the output of last encoder layer. Examples: .. 
code-block:: python import paddle from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, encoder_layer, num_layers, norm=None, enable_recompute=False): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm self.enable_recompute = enable_recompute def forward(self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False): r""" Applies a stack of N Transformer encoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last encoder layer. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the data type is bool, the unwanted positions have `False` values and the others have `True` values. When the data type is int, the unwanted positions have 0 values and the others have 1 values. When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ as `src`, representing the output of Transformer encoder. \ Or a tuple if `cache` is not None, except for encoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ src_mask = _convert_attention_mask(src_mask, src.dtype) output = src # To get cache from None when use_cache is True, which is compatible with HF # while HF requires decoder. The implementation here uses cache update in the # MultiHeadAttention not so efficiently, and maybe optimize it later. if cache is None and getattr(self, "_use_cache", False): cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts # to True when cache is not None. new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None all_attentions = [] if output_attentions else None # NOTE: Also includes embeding output which is same as HF. all_hidden_states = [output] if output_hidden_states else None for i, mod in enumerate(self.layers): if self.enable_recompute: # Note: recompute do not support pass as **kwargs yet. 
layer_outputs = recompute( mod, output, src_mask, None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions) else: layer_outputs = mod( output, src_mask=src_mask, cache=None if cache is None else cache[i] if isinstance(cache[i], MultiHeadAttention.Cache) else MultiHeadAttention.Cache(*cache[i]), output_attentions=output_attentions) if isinstance(layer_outputs, tuple): output = layer_outputs[0] outputs = layer_outputs[1:] else: output = layer_outputs outputs = None if output_hidden_states: all_hidden_states.append(output) if output_attentions: all_attentions.append(outputs[-1]) if new_caches is not None: new_caches.append(outputs[0] if isinstance(cache[ i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) if self.norm is not None: output = self.norm(output) if output_hidden_states: all_hidden_states[-1] = output if not return_dict: return output return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=output, past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_attentions) def gen_cache(self, src): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: list: It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] return cache ================================================ FILE: ppfleetx/models/language_model/ernie/layers/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import functools import inspect import warnings import paddle from paddle.nn import Layer def fn_args_to_dict(func, *args, **kwargs): """ Inspect function `func` and its arguments for running, and extract a dict mapping between argument names and keys. 
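Example (editor's illustrative sketch; `make_linear` is a hypothetical function used only for demonstration):

.. code-block:: python

    def make_linear(in_dim, out_dim, bias=True):
        pass

    # Positional and default arguments are merged into one name -> value dict.
    print(fn_args_to_dict(make_linear, 16, 32))
    # {'in_dim': 16, 'out_dim': 32, 'bias': True}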
""" if hasattr(inspect, 'getfullargspec'): (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(func) else: (spec_args, spec_varargs, spec_varkw, spec_defaults) = inspect.getargspec(func) # add positional argument values init_dict = dict(zip(spec_args, args)) # add default argument values kwargs_dict = dict(zip(spec_args[-len(spec_defaults):], spec_defaults)) if spec_defaults else {} for k in list(kwargs_dict.keys()): if k in init_dict: kwargs_dict.pop(k) kwargs_dict.update(kwargs) init_dict.update(kwargs_dict) return init_dict def adapt_stale_fwd_patch(self, name, value): """ Since there are some monkey patches for forward of PretrainedModel, such as model compression, we make these patches compatible with the latest forward method. """ if name == "forward": # NOTE(guosheng): In dygraph to static, `layer.forward` would be patched # by an instance of `StaticFunction`. And use string compare to avoid to # import fluid. if type(value).__name__.endswith('StaticFunction'): return value if hasattr(inspect, 'getfullargspec'): (patch_spec_args, patch_spec_varargs, patch_spec_varkw, patch_spec_defaults, _, _, _) = inspect.getfullargspec(value) (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(self.forward) else: (patch_spec_args, patch_spec_varargs, patch_spec_varkw, patch_spec_defaults) = inspect.getargspec(value) (spec_args, spec_varargs, spec_varkw, spec_defaults) = inspect.getargspec(self.forward) new_args = [ arg for arg in ('output_hidden_states', 'output_attentions', 'return_dict') if arg not in patch_spec_args and arg in spec_args ] if new_args: if self.__module__.startswith("paddlenlp"): warnings.warn( f"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} is patched and the patch " "might be based on an old oversion which missing some " f"arguments compared with the latest, such as {new_args}. " "We automatically add compatibility on the patch for " "these arguemnts, and maybe the patch should be updated.") else: warnings.warn( f"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} " "is patched and the patch might be conflict with patches made " f"by paddlenlp which seems have more arguments such as {new_args}. " "We automatically add compatibility on the patch for " "these arguemnts, and maybe the patch should be updated.") if isinstance(self, Layer) and inspect.isfunction(value): @functools.wraps(value) def wrap_fwd(*args, **kwargs): for arg in new_args: kwargs.pop(arg, None) return value(self, *args, **kwargs) else: @functools.wraps(value) def wrap_fwd(*args, **kwargs): for arg in new_args: kwargs.pop(arg, None) return value(*args, **kwargs) return wrap_fwd return value class InitTrackerMeta(type(Layer)): """ This metaclass wraps the `__init__` method of a class to add `init_config` attribute for instances of that class, and `init_config` use a dict to track the initial configuration. If the class has `_pre_init` or `_post_init` method, it would be hooked before or after `__init__` and called as `_pre_init(self, init_fn, init_args)` or `_post_init(self, init_fn, init_args)`. Since InitTrackerMeta would be used as metaclass for pretrained model classes, which always are Layer and `type(Layer)` is not `type`, thus use `type(Layer)` rather than `type` as base class for it to avoid inheritance metaclass conflicts. """ def __init__(cls, name, bases, attrs): init_func = cls.__init__ # If attrs has `__init__`, wrap it using accessable `_pre_init, _post_init`. 
# Otherwise, no need to wrap again since the super cls has been wrapped. # TODO: remove reduplicated tracker if using super cls `__init__` pre_init_func = getattr(cls, '_pre_init', None) if '__init__' in attrs else None post_init_func = getattr(cls, '_post_init', None) if '__init__' in attrs else None cls.__init__ = InitTrackerMeta.init_and_track_conf( init_func, pre_init_func, post_init_func) super(InitTrackerMeta, cls).__init__(name, bases, attrs) @staticmethod def init_and_track_conf(init_func, pre_init_func=None, post_init_func=None): """ Wraps `init_func`, which is the `__init__` method of a class, to add an `init_config` attribute for instances of that class. Args: init_func (callable): It should be the `__init__` method of a class. pre_init_func (callable, optional): If provided, it would be hooked before `init_func` and called as `pre_init_func(self, init_func, *init_args, **init_kwargs)`. Default None. post_init_func (callable, optional): If provided, it would be hooked after `init_func` and called as `post_init_func(self, init_func, *init_args, **init_kwargs)`. Default None. Returns: function: the wrapped function """ @functools.wraps(init_func) def __impl__(self, *args, **kwargs): # helper registered by `pre_init_func` if pre_init_func: pre_init_func(self, init_func, *args, **kwargs) # keep full configuration init_func(self, *args, **kwargs) # helper registered by `post_init_func` if post_init_func: post_init_func(self, init_func, *args, **kwargs) self.init_config = kwargs if args: kwargs['init_args'] = args kwargs['init_class'] = self.__class__.__name__ return __impl__ def __setattr__(self, name, value): value = adapt_stale_fwd_patch(self, name, value) return super(InitTrackerMeta, self).__setattr__(name, value) ================================================ FILE: ppfleetx/models/language_model/gpt/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .dygraph.hybrid_model import ( GPTModelHybrid, GPTForPretrainingPipe, GPTPretrainingCriterionHybird, GPTForPretrainingHybrid, GPTForGenerationHybrid) from .auto.auto_model import (GPTModelAuto, GPTForPretrainingAuto, GPTPretrainingCriterionAuto, GPTForGenerationAuto) from .dygraph.single_model import GPTForPretraining, GPTPretrainingCriterion, GPTModel, GPTForGeneration, GPTForSequenceClassification ================================================ FILE: ppfleetx/models/language_model/gpt/auto/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/gpt/auto/auto_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import collections import paddle import paddle.nn as nn import paddle.nn.functional as F import paddle.tensor as tensor import paddle.incubate as incubate import paddle.distributed.auto_parallel as auto from paddle.fluid import layers from paddle.common_ops_import import convert_dtype from paddle.nn.layer.transformer import _convert_param_attr_to_list from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from ..dygraph.processor import ( LogitsProcessorList, MinLengthLogitsProcessor, HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor) class MultiHeadAttention(nn.Layer): """ Attention maps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attend to information from different representation subspaces.
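Example (editor's illustrative sketch of the scaled dot-product step performed by `core_attn`; toy shapes, no mask and no distributed annotations):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    batch, heads, seq_len, head_dim = 2, 4, 5, 8
    q = paddle.rand([batch, heads, seq_len, head_dim])
    k = paddle.rand([batch, heads, seq_len, head_dim])
    v = paddle.rand([batch, heads, seq_len, head_dim])

    product = paddle.matmul(q, k, transpose_y=True) * head_dim ** -0.5
    weights = F.softmax(product)     # attention over the key axis
    out = paddle.matmul(weights, v)
    print(out.shape)  # [2, 4, 5, 8]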
""" Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, fuse_attn_qkv=False, use_recompute=False, recompute_granularity="full", mesh=None, mesh_idx=None): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.fuse_attn_qkv = fuse_attn_qkv self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.mesh = mesh self.mesh_idx = mesh_idx self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if self.fuse_attn_qkv: assert self.kdim == embed_dim assert self.vdim == embed_dim self.qkv_proj = nn.Linear( embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr) else: self.q_proj = nn.Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = nn.Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = nn.Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = nn.Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): auto.shard_tensor(self.qkv_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) mix_layer = self.qkv_proj(query) mix_layer = paddle.reshape_(mix_layer, [0, 0, self.num_heads, 3 * self.head_dim]) mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) assert not isinstance( cache, self.StaticCache ), "cache currently does not support the StaticCache type" if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) if use_cache is True: cache = self.Cache(k, v) return (q, k, v) if use_cache is False else (q, k, v, cache) def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. """ auto.shard_tensor(self.q_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=2) v = tensor.concat([cache.v, v], axis=2) if use_cache is True: cache = self.Cache(k, v) return (q, k, v) if use_cache is False else (q, k, v, cache) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. 
""" auto.shard_tensor(self.k_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.v_proj.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def core_attn(self, q, k, v, attn_mask=None): # scale dot product attention product = paddle.matmul( x=q, y=k, transpose_y=True) * self.head_dim**-0.5 if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) else: weights = incubate.softmax_mask_fuse_upper_triangle(product) if self.dropout: # with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) return out, weights def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if use_cache is False: if self.fuse_attn_qkv: q, k, v = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: if self.fuse_attn_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_recompute and self.recompute_granularity == "core_attn": out, weights = auto.recompute(self.core_attn)(q, k, v, attn_mask=attn_mask) else: out, weights = self.core_attn(q, k, v, attn_mask=attn_mask) auto.shard_tensor(self.out_proj.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerDecoder(nn.Layer): """ TransformerDecoder is a stack of N decoder layers. 
""" def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None, use_recompute=False, recompute_granularity="full"): super(TransformerDecoder, self).__init__() self.num_layers = num_layers self.layers = decoder_layers self.norm = norm self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) elif norm is not None: raise ValueError("Only support LayerNorm") def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False, cache=None): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. """ output = tgt new_caches = [] for i, mod in enumerate(self.layers): auto.shard_tensor( output, mod.mesh[mod.mesh_idx], [mod.mesh.dp] + [None for i in range(len(output.shape) - 1)]) if cache is None: if use_cache: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache) new_caches.append(new_cache) else: if self.use_recompute and self.recompute_granularity == "full": output = auto.recompute(mod)(output, memory, tgt_mask, use_cache, cache) else: output = mod(output, memory, tgt_mask, use_cache, cache) else: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` for more details. If `do_zip` is True, apply `zip` on these tuples to get a list with two elements. """ cache = [layer.gen_cache(memory) for layer in self.layers] if do_zip: cache = list(zip(*cache)) return cache class TransformerDecoderLayer(nn.Layer): """ The transformer decoder layer. It contains multiheadattention and some linear layers. 
""" def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, bias_attr=None, fuse_attn_qkv=False, use_recompute=False, recompute_granularity="full", mesh=None, mesh_idx=None): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.mesh = mesh self.mesh_idx = mesh_idx weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], fuse_attn_qkv=fuse_attn_qkv, use_recompute=use_recompute, recompute_granularity=recompute_granularity, mesh=mesh, mesh_idx=mesh_idx) self.linear1 = nn.Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = nn.Linear( dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): auto.shard_tensor(self.linear1.weight, self.mesh[self.mesh_idx], [None, self.mesh.mp]) auto.shard_tensor(self.linear2.weight, self.mesh[self.mesh_idx], [self.mesh.mp, None]) residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn": tgt = auto.recompute(self.self_attn)(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) # with get_rng_state_tracker().rng_state('global_seed'): tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) # with get_rng_state_tracker().rng_state('global_seed'): tgt = self.dropout2( self.linear2(F.gelu( self.linear1(tgt), approximate=True))) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): incremental_cache = self.self_attn.gen_cache( memory, type=self.self_attn.Cache) return incremental_cache class GPTEmbeddings(nn.Layer): """ Include embeddings from word and position embeddings. 
""" def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, mesh=None): super(GPTEmbeddings, self).__init__() self.mesh = mesh self.word_embeddings = nn.Embedding( vocab_size, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): if position_ids is None: ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones auto.shard_tensor(self.word_embeddings.weight, self.mesh[0], [self.mesh.mp, None]) input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings embeddings = self.dropout(embeddings) return embeddings class GPTModelAuto(nn.Layer): def __init__(self, vocab_size=51200, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, fuse_attn_qkv=False, use_recompute=False, recompute_granularity="full", mesh=None): super(GPTModelAuto, self).__init__() self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity if not mesh: raise RuntimeError( "AutoPrallel modeling need `mesh` to annotate distributed attribute." 
) self.mesh = mesh self.embeddings = GPTEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, self.initializer_range, self.mesh) stages = self.mesh.stages(num_layers) decoder_layers = nn.LayerList() for i in range(num_layers): decoder_layers.append( TransformerDecoderLayer( d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, dropout=hidden_dropout_prob, activation="gelu", attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), bias_attr=None, fuse_attn_qkv=fuse_attn_qkv, use_recompute=use_recompute, recompute_granularity=recompute_granularity, mesh=self.mesh, mesh_idx=stages[i])) self.decoder = TransformerDecoder( decoder_layers, num_layers, norm="LayerNorm", hidden_size=hidden_size, use_recompute=use_recompute, recompute_granularity=recompute_granularity) def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(attention_mask)[-1] - 1 position_ids = paddle.arange( past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) input_ids.stop_gradient = True position_ids.stop_gradient = True auto.shard_tensor( input_ids, self.mesh[0], [self.mesh.dp] + [None for i in range(len(input_ids.shape) - 1)]) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) if self.training == False: # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: if len(attention_mask.shape) == 2: attention_mask = attention_mask[:, None, None, :] attention_mask = attention_mask + causal_mask else: attention_mask = causal_mask # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True encoder_outputs = self.decoder( embedding_output, memory=None, tgt_mask=None if self.training else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) return encoder_outputs class GPTForPretrainingAuto(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. """ def __init__(self, gpt): super(GPTForPretrainingAuto, self).__init__() self.gpt = gpt def forward(self, input_ids, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None): outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) if use_cache: encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs x_dims_mapping = [self.gpt.mesh.dp] + [ None for i in range(len(encoder_outputs.shape) - 1) ] w_dims_mapping = [self.gpt.mesh.mp, None] matmul = auto.shard_op(paddle.matmul, self.gpt.mesh[-1], [x_dims_mapping, w_dims_mapping, None]) logits = matmul( encoder_outputs, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) if use_cache: return logits, cached_kvs else: return logits class GPTPretrainingCriterionAuto(nn.Layer): """ Criterion for GPT. It calculates the final loss. 
""" def __init__(self, mesh): super(GPTPretrainingCriterionAuto, self).__init__() self.mesh = mesh self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") def forward(self, prediction_scores, masked_lm_labels, loss_mask): """ Args: prediction_scores(Tensor): The logits of masked token prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. masked_lm_labels(Tensor): The labels of the masked language modeling, the dimensionality of `masked_lm_labels` is equal to `prediction_scores`. Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. loss_mask(Tensor): Mask used for calculating the loss of the masked language modeling to avoid calculating some unwanted tokens. Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ masked_lm_labels.stop_gradient = True loss_mask.stop_gradient = True auto.shard_tensor( loss_mask, self.mesh[-1], [self.mesh.dp] + [None for i in range(len(loss_mask.shape) - 1)]) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() return loss class GPTForGenerationAuto(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. """ def __init__(self, gpt, configs): super(GPTForGenerationAuto, self).__init__() self.gpt = gpt self.configs = configs self.max_length = self.configs.get('max_dec_len', 20) self.min_length = self.configs.get('min_dec_len', 0) self.decode_strategy = self.configs.get('decode_strategy', 'sampling') self.early_finish = self.configs.get('early_finish', True) self.temperature = self.configs.get('temperature', 1.0) self.top_k = self.configs.get('top_k', 0) self.top_p = self.configs.get('top_p', 1.0) self.use_topp_sampling = self.configs.get('use_topp_sampling', False) self.inference = self.configs.get('inference', False) self.repetition_penalty = self.configs.get('repetition_penalty', 1.0) self.num_beams = self.configs.get('num_beams', 1) self.num_beam_groups = self.configs.get('num_beam_groups', 1) self.length_penalty = self.configs.get('length_penalty', 0.0) self.early_stopping = self.configs.get('early_stopping', False) self.bos_token_id = self.configs.get('bos_token_id', None) self.eos_token_id = self.configs.get('eos_token_id', None) self.pad_token_id = self.configs.get('pad_token_id', None) self.decoder_start_token_id = self.configs.get( 'decoder_start_token_id', None) self.forced_bos_token_id = self.configs.get('forced_bos_token_id', None) self.forced_eos_token_id = self.configs.get('forced_eos_token_id', None) self.num_return_sequences = self.configs.get('num_return_sequences', 1) self.diversity_rate = self.configs.get('diversity_rate', 0.0) self.use_cache = self.configs.get('use_cache', True) def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): batch_size = 1 if bos_token_id is None: raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") if encoder_output is not None: batch_size = encoder_output.shape[0] return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( input_ids == 
pad_token_id).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( (eos_token_id is not None) and (pad_token_id != eos_token_id)) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids == pad_token_id ).astype(paddle.get_default_dtype()) * -1e9 else: attention_mask = paddle.zeros_like( input_ids, dtype=paddle.get_default_dtype()) return paddle.unsqueeze(attention_mask, axis=[1, 2]) def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): # update scores unfinished_scores = (scores * length + next_scores) / (length + 1) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores def get_logits_processor(self, min_length=None, max_length=None, eos_token_id=None, forced_bos_token_id=None, forced_eos_token_id=None, num_beams=1, num_beam_groups=1, diversity_rate=0.0, repetition_penalty=None): processors = LogitsProcessorList() if min_length is not None and eos_token_id is not None and min_length > -1: processors.append( MinLengthLogitsProcessor(min_length, eos_token_id)) if num_beam_groups > 1 and diversity_rate > 0.0: processors.append( HammingDiversityLogitsProcessor( diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups)) if repetition_penalty is not None and repetition_penalty != 1.0: processors.append( RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) if forced_bos_token_id is not None: processors.append( ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) if forced_eos_token_id is not None: processors.append( ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) # TODO # Add more pre_processing for distribution return processors def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): index = paddle.tile( paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) if attention_mask is not None: model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.gather(position_ids, index) if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: seq_len = model_kwargs["seq_len"] model_kwargs["seq_len"] = paddle.gather(seq_len, index) if "encoder_output" in model_kwargs and model_kwargs[ "encoder_output"] is not None: encoder_output = model_kwargs["encoder_output"] model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.gather(role_ids, index) return input_ids, model_kwargs def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): # only last token for inputs_ids if cache is defined in kwargs position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: if len(attention_mask.shape) == 4: attention_mask = attention_mask[:, -1, -1, :] if "int" in paddle.common_ops_import.convert_dtype( attention_mask.dtype): attention_mask = (1.0 - attention_mask) * -1e4 return { "input_ids": 
input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "cache": cache } def update_model_kwargs_for_generation(self, next_tokens, outputs, model_kwargs, is_encoder_decoder=False): # Update the model inputs during generation. # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` # and they contain pad value, the result vectors updated by this method # may be different from expected. In this case, you need to rewrite the # method. # update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1:]], axis=-1) # update position_ids if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = position_ids[:, -1:] + 1 # update attention_mask if not is_encoder_decoder and "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') if len(attention_mask.shape) == 4: attention_mask = nn.Pad2D( [0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D( [0, 1, 0, 0], value=-1e4)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') else: attention_mask = paddle.concat( [ attention_mask, paddle.ones( [attention_mask.shape[0], 1], dtype="int64") ], axis=-1) model_kwargs["attention_mask"] = attention_mask # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( [role_ids, role_ids[:, -1:]], axis=-1) model_kwargs['res'] = paddle.concat( [model_kwargs['res'], next_tokens], axis=1) return model_kwargs def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast( sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + 
paddle.arange(probs.shape[ 0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = paddle.shape(input_ids) # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype='int64') origin_len = paddle.shape(input_ids)[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype='int64') unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full( [batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) res = paddle.assign(input_ids) model_kwargs['res'] = res # use_cache is immutable, we split it off other mutable kwargs. assert 'use_cache' in model_kwargs immutable = {'use_cache': model_kwargs['use_cache']} del model_kwargs['use_cache'] def _forward_(**args): model_inputs = self.prepare_inputs_for_generation( input_ids, **args, **immutable) return self.gpt(**model_inputs, **immutable) def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): logits = outputs[0] if isinstance(outputs, tuple) else outputs # logits = paddle.matmul( # logits, # self.gpt.embeddings.word_embeddings.weight, # transpose_y=True) x_dims_mapping = [self.gpt.mesh.dp] + [ None for i in range(len(logits.shape) - 1) ] w_dims_mapping = [self.gpt.mesh.mp, None] matmul = auto.shard_op(paddle.matmul, self.gpt.mesh[-1], [x_dims_mapping, w_dims_mapping, None]) with paddle.fluid.name_scope('skip_quant'): logits = matmul( logits, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) if temperature is None or temperature == 1.0: probs = paddle.assign(origin_probs) origin_probs = paddle.log(origin_probs) else: origin_probs = paddle.log(origin_probs) logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: if self.use_topp_sampling: try: from ppfleetx_ops import topp_sampling except ImportError: raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" 
) top_ps_tensor = paddle.full( shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) # TODO fake random seed here # Users should set the random seed dynamically when inference _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) if not self.use_topp_sampling: next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation( scores, next_scores, cur_len - origin_len, unfinished_flag) input_ids = next_tokens if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) model_kwargs = self.update_model_kwargs_for_generation( next_tokens, outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids, scores, unfinished_flag, model_kwargs # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement # the value in model_kwargs should be tensor before while loop outputs = _forward_(**model_kwargs) input_ids, scores, unfinished_flag, model_kwargs = _post_process_( outputs, input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) attn_mask = model_kwargs['attention_mask'] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. model_kwargs['attention_mask'] = paddle.reshape( attn_mask, paddle.shape(attn_mask)) model_kwargs['cache'] = outputs[1] if isinstance(outputs, tuple) else None max_length = paddle.to_tensor(max_length) while cur_len < max_length: # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) # and change it to pass directly to _post_process_ to avoid # closed-loop problem of dynamic-to-static model input_ids, scores, unfinished_flag, model_kwargs = _post_process_( _forward_(**model_kwargs), input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) # early finish should be True in generation scenes, # If users want to test the inference speed, you can just set it False. 
if self.early_finish and not paddle.any(unfinished_flag): break return model_kwargs['res'][:, origin_len:], scores def forward(self, input_ids=None, **model_kwargs): max_length = self.max_length min_length = self.min_length decode_strategy = self.decode_strategy temperature = self.temperature top_k = self.top_k top_p = self.top_p repetition_penalty = self.repetition_penalty num_beams = self.num_beams num_beam_groups = self.num_beam_groups length_penalty = self.length_penalty early_stopping = self.early_stopping bos_token_id = self.bos_token_id eos_token_id = self.eos_token_id pad_token_id = self.pad_token_id decoder_start_token_id = self.decoder_start_token_id forced_bos_token_id = self.forced_bos_token_id forced_eos_token_id = self.forced_eos_token_id num_return_sequences = self.num_return_sequences diversity_rate = self.diversity_rate use_cache = self.use_cache assert ( decode_strategy in ["greedy_search", "sampling", "beam_search"] ), "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( decode_strategy) bos_token_id = bos_token_id if bos_token_id is not None else getattr( self.gpt, 'bos_token_id', None) eos_token_id = eos_token_id if eos_token_id is not None else getattr( self.gpt, 'eos_token_id', None) pad_token_id = pad_token_id if pad_token_id is not None else getattr( self.gpt, 'pad_token_id', None) forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr( self.gpt, 'forced_bos_token_id', None) forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr( self.gpt, 'forced_eos_token_id', None) decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr( self.gpt, 'decoder_start_token_id', None) # params check if input_ids is None: # Init `input_ids` with bos_token_id input_ids = self.prepare_input_ids_for_generation(bos_token_id) if model_kwargs.get("attention_mask", None) is None: # TODO # Init `attention_mask` depending on `pad_token_id` model_kwargs[ "attention_mask"] = self.prepare_attention_mask_for_generation( input_ids, pad_token_id, eos_token_id) if model_kwargs.get("position_ids", None) is None: model_kwargs['position_ids'] = paddle.arange( 0, paddle.shape(model_kwargs['attention_mask'])[-1], dtype=input_ids.dtype).unsqueeze(0) self.is_encoder_decoder = False model_kwargs["use_cache"] = use_cache if self.inference: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static min_len = input_ids.shape[-1] max_len = input_ids.shape[-1] paddle.increment(min_len, min_length) paddle.increment(max_len, max_length) else: input_len = input_ids.shape[-1] max_len = max_length + input_len min_len = min_length + input_len logits_processors = self.get_logits_processor( min_length=min_len, max_length=max_len, eos_token_id=eos_token_id, forced_bos_token_id=forced_bos_token_id, forced_eos_token_id=forced_eos_token_id, num_beams=num_beams, num_beam_groups=num_beam_groups, diversity_rate=diversity_rate, repetition_penalty=repetition_penalty) if decode_strategy == 'sampling': if num_return_sequences > 1: input_ids, model_kwargs = self.expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, **model_kwargs) ret = self.sample(input_ids, logits_processors, max_len, pad_token_id, eos_token_id, top_k, top_p, temperature, **model_kwargs) else: raise ValueError(f'Not support {decode_strategy} strategy yet!') return ret ================================================ FILE: 
ppfleetx/models/language_model/gpt/auto/auto_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import argparse import numpy as np import paddle import paddle.distributed as dist from paddle import LazyGuard from paddle.static import InputSpec from paddle.distributed.fleet import auto from ...auto_utils import process_configs import ppfleetx.models.language_model.gpt as gpt from ppfleetx.utils.log import logger from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.core.module.basic_module import BasicModule class LanguageModuleAuto(BasicModule): def __init__(self, configs): self.nranks = dist.get_world_size() super(LanguageModuleAuto, self).__init__(configs) self.loss_fn = self.get_loss_fn() def process_configs(self, configs): configs = process_configs(configs) return configs def get_model_size(self, l, h, v, s): P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h)) logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) class GPTModuleAuto(LanguageModuleAuto): def __init__(self, configs): super(GPTModuleAuto, self).__init__(configs) def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len self.get_model_size(l, h, v, s) self.tokenizer = GPTTokenizer.from_pretrained("gpt2") with LazyGuard(): model = gpt.GPTForPretrainingAuto( gpt.GPTModelAuto(**model_setting)) return model def get_loss_fn(self): model_setting = copy.deepcopy(self.configs.Model) return gpt.GPTPretrainingCriterionAuto(model_setting['mesh']) class GPTGenerationModuleAuto(BasicModule): def __init__(self, configs): self.configs = configs self.generation_cfgs = configs.Generation self.nranks = paddle.distributed.get_world_size() super().__init__(configs) def process_configs(self, configs): configs = process_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") with LazyGuard(): model = gpt.GPTForGenerationAuto( gpt.GPTModelAuto(**model_setting), self.generation_cfgs) self.tokenizer = GPTTokenizer.from_pretrained("gpt2") self.generation_cfgs['max_dec_len'] = self.adjust_length_to_model( self.generation_cfgs['max_dec_len'], 512) self.generation_cfgs['bos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['eos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['pad_token_id'] = self.tokenizer.eos_token_id return model def adjust_length_to_model(self, length, max_sequence_length): if length < 0 or length > max_sequence_length: length = max_sequence_length return length def left_padding(self, inputs, pad_id, 
padding="longest"): assert "input_ids" in inputs, "input_ids should be in inputs!" max_length = 0 for ids in inputs["input_ids"]: max_length = max(max_length, len(ids)) def extend_max_lenth(value, max_length, to_pad_id): return [to_pad_id] * (max_length - len(value)) + value def extend_filed(name, max_length, to_pad_id): values = inputs[name] res = [] for index, value in enumerate(values): res.append(extend_max_lenth(value, max_length, to_pad_id)) inputs[name] = res extend_filed("input_ids", max_length, pad_id) if "attention_mask" in inputs: extend_filed("attention_mask", max_length, 0) if "position_ids" in inputs: extend_filed("position_ids", max_length, 0) return inputs def input_spec(self): return [InputSpec(shape=[None, None], name="input_ids", dtype='int64')] ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import collections
import logging
from distutils.util import strtobool
import os
import math

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.tensor as tensor
from paddle.fluid import layers
from paddle.nn.layer.transformer import _convert_param_attr_to_list
import paddle.incubate as incubate
from paddle.common_ops_import import convert_dtype

import paddle.distributed.fleet as fleet
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer, SharedLayerDesc
from paddle.distributed.fleet.utils import recompute
from paddle.autograd import PyLayer

import sys

from .single_model import ExpertLayer
from .sequence_parallel_utils import ScatterOp, GatherOp, \
    mark_as_sequence_parallel_parameter, ColumnSequenceParallelLinear, RowSequenceParallelLinear
from .processor import (
    LogitsProcessorList, MinLengthLogitsProcessor,
    HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor,
    ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor)

from ppfleetx.models.language_model.moe import MoELayer
from ppfleetx.distributed.apis import env
from ppfleetx.utils.log import logger

import numpy as np

try:
    from paddle.nn.functional.flash_attention import flash_attention
except:
    flash_attention = None


def get_attr(layer, name):
    if getattr(layer, name, None) is not None:
        return getattr(layer, name, None)
    else:
        return get_attr(layer._layer, name)


def parallel_matmul(lm_output, logit_weights, parallel_output):
    """
    Matmul of `lm_output` and `logit_weights` under model parallelism.
    If the model-parallel world size is larger than 1, each rank computes
    its own slice of the logits; the slices are concatenated across the
    model-parallel group unless `parallel_output` is True.
    """
    hcg = env.get_hcg()
    model_parallel_group = hcg.get_model_parallel_group()
    world_size = hcg.get_model_parallel_world_size()
    rank = hcg.get_model_parallel_rank()

    if world_size > 1:
        input_parallel = paddle.distributed.collective._c_identity(
            lm_output, group=model_parallel_group)

        logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)

        if parallel_output:
            return logits

        return paddle.distributed.collective._c_concat(
            logits, group=model_parallel_group)
    else:
        logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)
        return logits


class MultiHeadAttention(nn.Layer):
    """
    Attention maps queries and a set of key-value pairs to outputs, and
    Multi-Head Attention performs multiple parallel attention to jointly
    attend to information from different representation subspaces.
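
    Here the q/k/v and output projections are split across the model-parallel
    group, so each rank only computes `num_heads / num_partitions` heads.
    Inputs are expected as `[batch, seq, hidden]`, or `[seq, batch, hidden]`
    when `sequence_parallel` is enabled.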
""" Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, output_layer_weight_attr=None, bias_attr=None, fuse_attn_qkv=False, scale_qk_coeff=1.0, num_partitions=1, fused_linear=False, use_recompute=False, recompute_granularity="full", sequence_parallel=False, do_recompute=True, use_flash_attn=False): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.fuse_attn_qkv = fuse_attn_qkv self.scale_qk_coeff = scale_qk_coeff self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.do_recompute = do_recompute self.sequence_parallel = sequence_parallel self.use_flash_attn = use_flash_attn if flash_attention else None if sequence_parallel: ColumnParallelLinear = ColumnSequenceParallelLinear RowParallelLinear = RowSequenceParallelLinear else: ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear RowParallelLinear = fleet.meta_parallel.RowParallelLinear self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.num_heads % num_partitions == 0, "num_heads {} must be divisible by num_partitions {}".format( self.num_heads, num_partitions) self.num_heads = self.num_heads // num_partitions if self.fuse_attn_qkv: assert self.kdim == embed_dim assert self.vdim == embed_dim self.qkv_proj = ColumnParallelLinear( embed_dim, 3 * embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) else: self.q_proj = ColumnParallelLinear( embed_dim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) self.k_proj = ColumnParallelLinear( self.kdim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) self.v_proj = ColumnParallelLinear( self.vdim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attr, has_bias=True, gather_output=False, fuse_matmul_bias=fused_linear) self.out_proj = RowParallelLinear( embed_dim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=output_layer_weight_attr, has_bias=True, input_is_parallel=True, fuse_matmul_bias=fused_linear) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) assert not isinstance( cache, self.StaticCache ), "cache currently does not support the StaticCache type" if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. 
If `cache` is not None, using cached results to reduce redundant calculations. """ q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. """ k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def _flash_attention(self, q, k, v, attn_mask=None): if self.sequence_parallel: perm = [1, 0, 2, 3] q = tensor.transpose(x=q, perm=perm) k = tensor.transpose(x=k, perm=perm) v = tensor.transpose(x=v, perm=perm) out, weights = flash_attention( q, k, v, self.dropout, causal=True, return_softmax=self.need_weights) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) if self.sequence_parallel: perm = [1, 0, 2] out = tensor.transpose(x=out, perm=perm) return out, weights def core_attn(self, q, k, v, attn_mask=None): perm = [1, 2, 0, 3] if self.sequence_parallel else [0, 2, 1, 3] q = tensor.transpose(x=q, perm=perm) k = tensor.transpose(x=k, perm=perm) v = tensor.transpose(x=v, perm=perm) # scale dot product attention scale_qk_coeff = self.scale_qk_coeff * self.head_dim**0.5 product = paddle.matmul( x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) if self.scale_qk_coeff != 1.0: product = product.scale(self.scale_qk_coeff) # softmax_mask_fuse_upper_triangle is not supported sif paddle is not compiled with cuda/rocm if not paddle.is_compiled_with_cuda(): attn_mask = get_triangle_upper_mask(product, attn_mask) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) else: weights = incubate.softmax_mask_fuse_upper_triangle(product) if self.dropout: with get_rng_state_tracker().rng_state('local_seed'): weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads if self.sequence_parallel: out = 
tensor.transpose(out, perm=[2, 0, 1, 3]) else: out = tensor.transpose(out, perm=[0, 2, 1, 3]) # If sequence_parallel is true, out shape is [s, b, h] after reshape # else out shape is [b, s, h] out = tensor.reshape(x=out, shape=[0, 0, -1]) return out, weights def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # if sequence_parallel is true, query, key, value shape are [s, b, h], # else their shape are [b, s, h], n is mp parallelism. # no matter sequence_parallel is true or false, # after reshape, q, k, v shape should be [b, num_heads/n, s, head_dim] # compute q ,k ,v if self.fuse_attn_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_flash_attn and attn_mask is None: attn_func = self._flash_attention else: attn_func = self.core_attn if self.use_recompute and self.recompute_granularity == "core_attn" and self.do_recompute: out, weights = recompute(attn_func, q, k, v, attn_mask) else: out, weights = attn_func(q, k, v, attn_mask=attn_mask) # project to output # if sequence_parallel is true, out shape are [s/n, b, h], # else their shape are [b, s, h], n is mp parallelism. out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerDecoder(nn.Layer): """ TransformerDecoder is a stack of N decoder layers. """ def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None, use_recompute=False, recompute_granularity="full", sequence_parallel=False, no_recompute_layers=None): super(TransformerDecoder, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.no_recompute_layers = no_recompute_layers self.num_layers = num_layers self.layers = decoder_layers self.norm = norm self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.sequence_parallel = sequence_parallel if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) # if sequence parallel is true, # register hook to all_reduce gradient of weight, bias if self.sequence_parallel: mark_as_sequence_parallel_parameter(self.norm.weight) mark_as_sequence_parallel_parameter(self.norm.bias) elif norm is not None: raise ValueError("Only support LayerNorm") def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False, cache=None): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. 
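
        When `use_cache` is True the method also returns the per-layer caches,
        i.e. `(output, new_caches)`; otherwise only `output` is returned.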
""" output = tgt new_caches = [] for i, mod in enumerate(self.layers): if cache is None: if use_cache: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache) new_caches.append(new_cache) else: if self.use_recompute and self.recompute_granularity == "full" and i not in self.no_recompute_layers: output = recompute(mod, output, memory, tgt_mask, use_cache, cache) else: output = mod(output, memory, tgt_mask, use_cache, cache) else: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` for more details. If `do_zip` is True, apply `zip` on these tuples to get a list with two elements. """ cache = [layer.gen_cache(memory) for layer in self.layers] if do_zip: cache = list(zip(*cache)) return cache class TransformerDecoderLayer(nn.Layer): """ The transformer decoder layer. It contains multiheadattention and some linear layers. """ def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, weight_attr=None, output_layer_weight_attr=None, bias_attr=None, num_partitions=1, fused_linear=False, fuse_attn_qkv=False, scale_qk_coeff=1.0, moe_configs=None, recompute_attn=False, use_recompute=False, recompute_granularity="full", sequence_parallel=False, do_recompute=True, skip_quant_tensors=[], use_flash_attn=False): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.sequence_parallel = sequence_parallel self.do_recompute = do_recompute self.expert_mode = False # moe config if moe_configs is not None: self.gate = moe_configs.get('gate', 'gshard') self.top_k = moe_configs.get('top_k', 2) self.num_experts = moe_configs.get('num_experts', 1) self.expert_mode = moe_configs.get('expert_mode', False) if sequence_parallel: ColumnParallelLinear = ColumnSequenceParallelLinear RowParallelLinear = RowSequenceParallelLinear else: ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear RowParallelLinear = fleet.meta_parallel.RowParallelLinear weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) output_layer_weight_attrs = _convert_param_attr_to_list( output_layer_weight_attr, 3) self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], output_layer_weight_attr=output_layer_weight_attrs[0], num_partitions=num_partitions, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=scale_qk_coeff, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, do_recompute=do_recompute, use_flash_attn=use_flash_attn) if self.expert_mode: experts_list = nn.LayerList([ ExpertLayer(d_model, dim_feedforward) for 
e in range(self.num_experts) ]) hcg = env.get_hcg() moe_group = hcg.get_expert_parallel_group() mp_group = hcg.get_model_parallel_group() self.moe_mlp = MoELayer( d_model=d_model, experts=experts_list, gate=self.gate, top_k=self.top_k, moe_group=moe_group, mp_group=mp_group, recompute_interval=int(self.use_recompute)) else: self.linear1 = ColumnParallelLinear( d_model, dim_feedforward, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=weight_attrs[2], gather_output=False, has_bias=True, fuse_matmul_bias=fused_linear) self.linear2 = RowParallelLinear( dim_feedforward, d_model, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=output_layer_weight_attrs[2], input_is_parallel=True, has_bias=True, fuse_matmul_bias=fused_linear) if 'linear1' in skip_quant_tensors: self.linear1.skip_quant = True if 'linear2' in skip_quant_tensors: self.linear2.skip_quant = True self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) if self.sequence_parallel: # if sequence parallel is true, register hook to all_reduce gradient of bias mark_as_sequence_parallel_parameter(self.norm1.weight) mark_as_sequence_parallel_parameter(self.norm1.bias) mark_as_sequence_parallel_parameter(self.norm2.weight) mark_as_sequence_parallel_parameter(self.norm2.bias) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") self.activation = getattr(F, activation) def forward(self, tgt, memory=None, tgt_mask=None, use_cache=False, cache=None): residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn" and self.do_recompute: tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache) else: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) # If use sequence_parallel, different input partition in dropout # should use different seed. if self.sequence_parallel: current_seed = 'local_seed' else: current_seed = 'global_seed' with get_rng_state_tracker().rng_state(current_seed): tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) if self.expert_mode: tgt = self.moe_mlp(tgt) else: with get_rng_state_tracker().rng_state(current_seed): tgt = self.dropout2( self.linear2(F.gelu( self.linear1(tgt), approximate=True))) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): incremental_cache = self.self_attn.gen_cache( memory, type=self.self_attn.Cache) return incremental_cache class GPTEmbeddings(nn.Layer): """ Include embeddings from word and position embeddings. 
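
    The word embedding table is a `VocabParallelEmbedding` sharded over the
    model-parallel group; when `sequence_parallel` is enabled the returned
    embeddings are scattered along the sequence dimension as `[s/n, b, h]`.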
""" def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, sequence_parallel=False, freeze_embedding=False): super(GPTEmbeddings, self).__init__() self.sequence_parallel = sequence_parallel self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( vocab_size, hidden_size, mp_group=env.get_hcg().get_model_parallel_group(), weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) if freeze_embedding: self.word_embeddings.weight.learning_rate = 0.0 self.position_embeddings.weight.learning_rate = 0.0 self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): if position_ids is None: ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings # if sequence parallel is true, change embedding shape [b, s, h] to [s, b, h] # set the sequence dim as first, so the split in sequence dim is data-continuous if self.sequence_parallel: embeddings = paddle.transpose(embeddings, perm=[1, 0, 2]) embeddings = ScatterOp.apply(embeddings) with get_rng_state_tracker().rng_state('local_seed'): embeddings = self.dropout(embeddings) else: embeddings = self.dropout(embeddings) return embeddings class GPTModelHybrid(nn.Layer): def __init__(self, vocab_size=51200, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, num_partitions=1, moe_configs=None, use_recompute=False, fused_linear=False, fuse_attn_qkv=False, scale_qk_by_layer_num=True, recompute_granularity="full", sequence_parallel=False, no_recompute_layers=None, skip_tensor_map={}, freeze_embedding=False, use_flash_attn=False, fused_softmax_with_triangular=False): super(GPTModelHybrid, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.fused_softmax_with_triangular = fused_softmax_with_triangular if use_flash_attn: if flash_attention: logger.info("Flash-attention enabled.") else: use_flash_attn = False logger.warning( "Flash-attention is not support in this Paddle version.") hcg = env.get_hcg() mp_size = hcg.get_model_parallel_world_size() if mp_size <= 1: sequence_parallel = False logging.warning( "If mp_size <= 1, sequence_parallel strategy will be turned off in GPTModelHybrid model." 
) self.embeddings = GPTEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, self.initializer_range, sequence_parallel, freeze_embedding) self.sequence_parallel = sequence_parallel decoder_layers = nn.LayerList() for i in range(num_layers): decoder_layers.append( TransformerDecoderLayer( d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, dropout=hidden_dropout_prob, activation="gelu", attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), output_layer_weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range / math.sqrt( 2.0 * num_layers))), bias_attr=None, num_partitions=num_partitions, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=num_layers if scale_qk_by_layer_num else 1.0, moe_configs=moe_configs, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, do_recompute=i not in no_recompute_layers, skip_quant_tensors=skip_tensor_map.get('block_{}'.format( i), []), use_flash_attn=use_flash_attn)) self.decoder = TransformerDecoder( decoder_layers, num_layers, norm="LayerNorm", hidden_size=hidden_size, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, no_recompute_layers=no_recompute_layers) def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(attention_mask)[-1] - 1 position_ids = paddle.arange( past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) # if sequence_parallel is true, embedding_output shape is [s/n, b, h] # else its shape is [b, s, h], n is mp parallelism embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: if len(attention_mask.shape) == 2: attention_mask = attention_mask[:, None, None, :] attention_mask = attention_mask + causal_mask else: attention_mask = causal_mask # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True encoder_outputs = self.decoder( embedding_output, memory=None, tgt_mask=None if (self.fused_softmax_with_triangular and self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) if self.sequence_parallel: encoder_outputs = GatherOp.apply(encoder_outputs) return encoder_outputs class GPTForPretrainingHybrid(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
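
    A minimal usage sketch (illustrative values only, assuming the fleet
    hybrid-parallel environment has already been initialized):

        model = GPTForPretrainingHybrid(GPTModelHybrid(vocab_size=51200))
        logits = model(input_ids)  # token logits tied to the word-embedding weight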
""" def __init__(self, gpt): super(GPTForPretrainingHybrid, self).__init__() self.gpt = gpt # extra_parameters using for sharding stage3 to register extra_parameters self.extra_parameters = [ get_attr(self.gpt.embeddings.word_embeddings, "weight") ] def forward(self, input_ids, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None): outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) if use_cache: encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs logits = parallel_matmul( encoder_outputs, get_attr(self.gpt.embeddings.word_embeddings, "weight"), True) if use_cache: return logits, cached_kvs else: return logits class GPTPretrainingCriterionHybird(nn.Layer): """ Criterion for GPT. It calculates the final loss. """ def __init__(self, topo=None, sequence_parallel=False): super(GPTPretrainingCriterionHybird, self).__init__() self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") self.parallel_loss_func = \ fleet.meta_parallel.ParallelCrossEntropy(mp_group=env.get_hcg().get_model_parallel_group()) self.sequence_parallel = sequence_parallel def forward(self, prediction_scores, masked_lm_labels, loss_mask): """ Args: prediction_scores(Tensor): The logits of masked token prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. masked_lm_labels(Tensor): The labels of the masked language modeling, the dimensionality of `masked_lm_labels` is equal to `prediction_scores`. Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. loss_mask(Tensor): Mask used for calculating the loss of the masked language modeling to avoid calculating some unwanted tokens. Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. 
""" hcg = env.get_hcg() mp_size = hcg.get_model_parallel_world_size() if self.sequence_parallel: masked_lm_labels = masked_lm_labels.transpose([1, 0]) loss_mask = loss_mask.transpose([1, 0]) if mp_size > 1: if paddle.is_compiled_with_cuda() and True: masked_lm_loss = self.parallel_loss_func( prediction_scores, masked_lm_labels.unsqueeze(2)) else: prediction_scores = ConcatSoftmaxInput.apply( prediction_scores, group=env.get_hcg().get_model_parallel_group()) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) else: masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() return loss # these Layers is just for PipelineParallel class GPTPretrainingCriterionPipe(GPTPretrainingCriterionHybird): """Extends GPTPretrainingCriterion to meet the input standard.""" def forward(self, prediction_scores, args): masked_lm_labels = args[0] loss_mask = args[1] loss = super().forward(prediction_scores, masked_lm_labels, loss_mask) return loss class EmbeddingPipe(GPTEmbeddings): """Extends GPTEmbeddings to forward attention_mask through the pipeline.""" @property def embedding_weight(self): return get_attr(self.word_embeddings, "weight") def forward(self, tensors): input_ids, position_ids = tensors embeddings = super().forward( input_ids=input_ids, position_ids=position_ids) return embeddings class LayerNormPipe(nn.Layer): def __init__(self, normalized_shape, epsilon=1e-05, weight_attr=None, bias_attr=None, name=None, sequence_parallel=False, is_last=False): super(LayerNormPipe, self).__init__() self.sequence_parallel = sequence_parallel self.is_last = is_last self.norm = nn.LayerNorm( normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, bias_attr=bias_attr, name=name) if self.sequence_parallel: mark_as_sequence_parallel_parameter(self.norm.weight) mark_as_sequence_parallel_parameter(self.norm.bias) def forward(self, input): output = self.norm(input) if self.sequence_parallel and self.is_last: output = GatherOp.apply(output) return output class GPTForPretrainingPipe(PipelineLayer): """GPTForPretraining adapted for pipeline parallelism. The largest change is flattening the GPTModel class so we can express it as a sequence of layers including embedding, transformer layers, and output. 
""" def __init__(self, vocab_size, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, num_partitions=1, topology=None, use_recompute=False, fused_linear=False, fuse_attn_qkv=False, scale_qk_by_layer_num=True, moe_configs=None, recompute_granularity="full", virtual_pp_degree=1, sequence_parallel=False, no_recompute_layers=None, pp_recompute_interval=1, use_flash_attn=False, fused_softmax_with_triangular=False): # forward desc self.descs = [] if no_recompute_layers is None: no_recompute_layers = [] else: if recompute_granularity == 'full': assert len(no_recompute_layers) == 0, \ "for pp with full recompute, no_recompute_layers is not support" if use_flash_attn: if flash_attention: logger.info("Flash-attention enabled.") else: use_flash_attn = False logger.warning( "Flash-attention is not support in this Paddle version.") hcg = env.get_hcg() mp_size = hcg.get_model_parallel_world_size() if mp_size <= 1: sequence_parallel = False logging.warning( "If mp_size <= 1, sequence_parallel strategy will be turned off in GPTForPretrainingPipe model." ) self.descs.append( SharedLayerDesc( 'embed', EmbeddingPipe, shared_weight_attr='embedding_weight', vocab_size=vocab_size, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, max_position_embeddings=max_position_embeddings, type_vocab_size=type_vocab_size, initializer_range=0.02, sequence_parallel=sequence_parallel)) for i in range(num_layers): self.descs.append( LayerDesc( TransformerDecoderLayer, d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, dropout=hidden_dropout_prob, activation=hidden_act, attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=initializer_range)), output_layer_weight_attr=paddle. 
ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range / math.sqrt(2.0 * num_layers))), bias_attr=None, num_partitions=num_partitions, moe_configs=moe_configs, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=num_layers if scale_qk_by_layer_num else 1.0, use_recompute=use_recompute, recompute_granularity=recompute_granularity, sequence_parallel=sequence_parallel, do_recompute=i not in no_recompute_layers, use_flash_attn=use_flash_attn)) self.descs.append( LayerDesc( LayerNormPipe, normalized_shape=hidden_size, sequence_parallel=sequence_parallel, is_last=True)) def _logits_helper(embedding, output): return parallel_matmul(output, embedding.embedding_weight, True) self.descs.append( SharedLayerDesc( 'embed', EmbeddingPipe, forward_func=_logits_helper, shared_weight_attr='embedding_weight', vocab_size=vocab_size, hidden_size=hidden_size, hidden_dropout_prob=hidden_dropout_prob, max_position_embeddings=max_position_embeddings, type_vocab_size=type_vocab_size, initializer_range=0.02)) recompute_interval = 0 if recompute and recompute_granularity == "full": assert pp_recompute_interval <= \ num_layers // (virtual_pp_degree * env.get_hcg().topology().get_dim_size("pipe")), \ "pp recompute interval should smaller than num layers of each pp chunk" recompute_interval = pp_recompute_interval seg_method = "layer:TransformerDecoderLayer" if num_layers % env.get_hcg().topology().get_dim_size("pipe") != 0: seg_method = "uniform" super().__init__( layers=self.descs, loss_fn=GPTPretrainingCriterionPipe( sequence_parallel=sequence_parallel), topology=env.get_hcg().topology(), seg_method=seg_method, recompute_interval=recompute_interval, recompute_ctx={ "mp_group": env.get_hcg().get_model_parallel_group(), "offload": False, "partition": False, }, num_virtual_pipeline_stages=virtual_pp_degree) class GPTForGenerationHybrid(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
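        configs (dict): Decoding options read in ``__init__``.

        Example (illustrative; ``gpt_model`` and ``input_ids`` are assumed to be
        a constructed hybrid GPT model and a prepared int64 tensor, and the
        values below are placeholders)::

            configs = {
                'max_dec_len': 20,
                'decode_strategy': 'sampling',
                'temperature': 1.0,
                'top_k': 5,
                'top_p': 0.9,
                'use_cache': True,
            }
            generator = GPTForGenerationHybrid(gpt_model, configs)
            output_ids, scores = generator(input_ids)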
""" def __init__(self, gpt, configs): super(GPTForGenerationHybrid, self).__init__() self.gpt = gpt # extra_parameters using for sharding stage3 to register extra_parameters self.extra_parameters = [ get_attr(self.gpt.embeddings.word_embeddings, "weight") ] self.configs = configs self.max_length = self.configs.get('max_dec_len', 20) self.min_length = self.configs.get('min_dec_len', 0) self.decode_strategy = self.configs.get('decode_strategy', 'sampling') self.temperature = self.configs.get('temperature', 1.0) self.top_k = self.configs.get('top_k', 0) self.top_p = self.configs.get('top_p', 1.0) self.repetition_penalty = self.configs.get('repetition_penalty', 1.0) self.num_beams = self.configs.get('num_beams', 1) self.num_beam_groups = self.configs.get('num_beam_groups', 1) self.length_penalty = self.configs.get('length_penalty', 0.0) self.early_stopping = self.configs.get('early_stopping', False) self.bos_token_id = self.configs.get('bos_token_id', None) self.eos_token_id = self.configs.get('eos_token_id', None) self.pad_token_id = self.configs.get('pad_token_id', None) self.decoder_start_token_id = self.configs.get( 'decoder_start_token_id', None) self.forced_bos_token_id = self.configs.get('forced_bos_token_id', None) self.forced_eos_token_id = self.configs.get('forced_eos_token_id', None) self.num_return_sequences = self.configs.get('num_return_sequences', 1) self.diversity_rate = self.configs.get('diversity_rate', 0.0) self.use_cache = self.configs.get('use_cache', True) def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): batch_size = 1 if bos_token_id is None: raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") if encoder_output is not None: batch_size = encoder_output.shape[0] return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( input_ids == pad_token_id).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( (eos_token_id is not None) and (pad_token_id != eos_token_id)) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids == pad_token_id ).astype(paddle.get_default_dtype()) * -1e9 else: attention_mask = paddle.zeros_like( input_ids, dtype=paddle.get_default_dtype()) return paddle.unsqueeze(attention_mask, axis=[1, 2]) def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): # update scores unfinished_scores = (scores * length + next_scores) / (length + 1) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores def get_logits_processor(self, min_length=None, max_length=None, eos_token_id=None, forced_bos_token_id=None, forced_eos_token_id=None, num_beams=1, num_beam_groups=1, diversity_rate=0.0, repetition_penalty=None): processors = LogitsProcessorList() if min_length is not None and eos_token_id is not None and min_length > -1: processors.append( MinLengthLogitsProcessor(min_length, eos_token_id)) if num_beam_groups > 1 and diversity_rate > 0.0: processors.append( HammingDiversityLogitsProcessor( diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups)) if repetition_penalty is not None and repetition_penalty != 1.0: processors.append( RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) if forced_bos_token_id is not None: processors.append( 
ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) if forced_eos_token_id is not None: processors.append( ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) # TODO # Add more pre_processing for distribution return processors def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): index = paddle.tile( paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) if attention_mask is not None: model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.gather(position_ids, index) if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: seq_len = model_kwargs["seq_len"] model_kwargs["seq_len"] = paddle.gather(seq_len, index) if "encoder_output" in model_kwargs and model_kwargs[ "encoder_output"] is not None: encoder_output = model_kwargs["encoder_output"] model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.gather(role_ids, index) return input_ids, model_kwargs def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): # only last token for inputs_ids if cache is defined in kwargs position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: if len(attention_mask.shape) == 4: attention_mask = attention_mask[:, -1, -1, :] if "int" in paddle.common_ops_import.convert_dtype( attention_mask.dtype): attention_mask = (1.0 - attention_mask) * -1e4 if cache is not None: input_ids = input_ids[:, -1].unsqueeze(-1) if position_ids is not None: position_ids = position_ids[:, -1].unsqueeze(-1) return { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "cache": cache } def update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False): # Update the model inputs during generation. # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` # and they contain pad value, the result vectors updated by this method # may be different from expected. In this case, you need to rewrite the # method. 
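        # In outline (added description, behavior unchanged): the default update
        # below takes the new `cache` from `outputs[1]` when the model returned a
        # tuple, repeats the last `token_type_ids` and `role_ids` columns, appends
        # `position_ids[:, -1:] + 1`, and grows `attention_mask` by one step
        # (replicate-pad a row, pad a masked column, then unmask the new position).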
# update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1:]], axis=-1) # update position_ids if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.concat( [position_ids, position_ids[:, -1:] + 1], axis=-1) # update attention_mask if not is_encoder_decoder and "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') if len(attention_mask.shape) == 4: attention_mask = nn.Pad2D( [0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D( [0, 1, 0, 0], value=-1e4)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') else: attention_mask = paddle.concat( [ attention_mask, paddle.ones( [attention_mask.shape[0], 1], dtype="int64") ], axis=-1) model_kwargs["attention_mask"] = attention_mask # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( [role_ids, role_ids[:, -1:]], axis=-1) return model_kwargs def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast( sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + paddle.arange(probs.shape[ 0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = input_ids.shape origin_len = input_ids.shape[1] unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full( [batch_size, 1], 0.0, 
dtype=paddle.get_default_dtype()) # use_cache is immutable, we split it off other mutable kwargs. assert 'use_cache' in model_kwargs immutable = {'use_cache': model_kwargs['use_cache']} del model_kwargs['use_cache'] def _forward_(**args): model_inputs = self.prepare_inputs_for_generation( input_ids, **args, **immutable) return self.gpt(**model_inputs, **immutable) def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): logits = outputs[0] if isinstance(outputs, tuple) else outputs logits = parallel_matmul( logits, get_attr(self.gpt.embeddings.word_embeddings, "weight"), False) # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) origin_probs = paddle.log(origin_probs) if temperature is not None and temperature != 1.0: logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: probs = TopPProcess(probs, top_p, min_tokens_to_keep) next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation( scores, next_scores, cur_len - origin_len, unfinished_flag) input_ids = paddle.concat([input_ids, next_tokens], axis=1) if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) model_kwargs = self.update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids, scores, unfinished_flag, model_kwargs # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement # the value in model_kwargs should be tensor before while loop outputs = _forward_(**model_kwargs) input_ids, scores, unfinished_flag, model_kwargs = _post_process_( outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs) cur_len += 1 attn_mask = model_kwargs['attention_mask'] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
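        # (added note) reshaping by the runtime `paddle.shape(attn_mask)` keeps all
        # four dimensions dynamic after dynamic-to-static conversion, so the
        # per-step mask updates inside the decoding loop do not bake a fixed
        # sequence length into the exported graph.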
model_kwargs['attention_mask'] = paddle.reshape( attn_mask, paddle.shape(attn_mask)) model_kwargs['cache'] = outputs[1] if isinstance(outputs, tuple) else None while cur_len < max_length: # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) # and change it to pass directly to _post_process_ to avoid # closed-loop problem of dynamic-to-static model input_ids, scores, unfinished_flag, model_kwargs = _post_process_( _forward_(**model_kwargs), input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs) cur_len += 1 if not paddle.any(unfinished_flag): break return input_ids[:, origin_len:], scores def forward(self, input_ids=None, **model_kwargs): max_length = self.max_length min_length = self.min_length decode_strategy = self.decode_strategy temperature = self.temperature top_k = self.top_k top_p = self.top_p repetition_penalty = self.repetition_penalty num_beams = self.num_beams num_beam_groups = self.num_beam_groups length_penalty = self.length_penalty early_stopping = self.early_stopping bos_token_id = self.bos_token_id eos_token_id = self.eos_token_id pad_token_id = self.pad_token_id decoder_start_token_id = self.decoder_start_token_id forced_bos_token_id = self.forced_bos_token_id forced_eos_token_id = self.forced_eos_token_id num_return_sequences = self.num_return_sequences diversity_rate = self.diversity_rate use_cache = self.use_cache assert ( decode_strategy in ["greedy_search", "sampling", "beam_search"] ), "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( decode_strategy) bos_token_id = bos_token_id if bos_token_id is not None else getattr( self.gpt, 'bos_token_id', None) eos_token_id = eos_token_id if eos_token_id is not None else getattr( self.gpt, 'eos_token_id', None) pad_token_id = pad_token_id if pad_token_id is not None else getattr( self.gpt, 'pad_token_id', None) forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr( self.gpt, 'forced_bos_token_id', None) forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr( self.gpt, 'forced_eos_token_id', None) decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr( self.gpt, 'decoder_start_token_id', None) # params check if input_ids is None: # Init `input_ids` with bos_token_id input_ids = self.prepare_input_ids_for_generation(bos_token_id) if model_kwargs.get("attention_mask", None) is None: # TODO # Init `attention_mask` depending on `pad_token_id` model_kwargs[ "attention_mask"] = self.prepare_attention_mask_for_generation( input_ids, pad_token_id, eos_token_id) self.is_encoder_decoder = False model_kwargs["use_cache"] = use_cache max_length += input_ids.shape[-1] min_length += input_ids.shape[-1] logits_processors = self.get_logits_processor( min_length=min_length, max_length=max_length, eos_token_id=eos_token_id, forced_bos_token_id=forced_bos_token_id, forced_eos_token_id=forced_eos_token_id, num_beams=num_beams, num_beam_groups=num_beam_groups, diversity_rate=diversity_rate, repetition_penalty=repetition_penalty) if decode_strategy == 'sampling': if num_return_sequences > 1: input_ids, model_kwargs = self.expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, **model_kwargs) ret = self.sample(input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k, top_p, temperature, **model_kwargs) else: raise ValueError(f'Not support {decoding_strategy} strategy yet!') return ret def get_triangle_upper_mask(x, 
mask): if mask is not None: return mask mask = paddle.full_like(x, -np.inf) mask.stop_gradient = True mask = paddle.triu(mask, diagonal=1) mask.stop_gradient = True return mask class ConcatSoftmaxInput(PyLayer): @staticmethod def forward(ctx, inp, group=None): inputs = [] paddle.distributed.all_gather(inputs, inp, group=group) with paddle.no_grad(): cat = paddle.concat(inputs, axis=-1) ctx.cat_args = group return cat @staticmethod def backward(ctx, grad): group = ctx.cat_args with paddle.no_grad(): grads = paddle.split( grad, paddle.distributed.get_world_size(group), axis=-1) grad = grads[paddle.distributed.get_rank(group)] return grad ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/processor.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import List import inspect from abc import ABC import paddle class LogitsProcessorList(List): def __call__(self, input_ids, logits, **kwargs): for processor in self: processor_args = inspect.signature(processor.__call__).parameters if len(processor_args) > 2: assert all( arg in kwargs for arg in list(processor_args.keys())[2:] ), f"The parameters don't match for {processor.__class__}" logits = processor(input_ids, logits, **kwargs) else: logits = processor(input_ids, logits) return logits class LogitsProcessor(ABC): """ Abstract base class for all logit processors that can be applied during generation. """ def __call__(self, input_ids, logits): raise NotImplementedError( f"{self.__class__} is an abstract class. " "Only classes inheriting this class can be called.") class MinLengthLogitsProcessor(LogitsProcessor): r""" Enforcing a min-length by setting EOS probability to 0. Args: min_length (int): The minimum length of generation sequence. eos_token_id (int): The id of the `end-of-sequence` token. """ def __init__(self, min_length, eos_token_id): if not isinstance(min_length, int) or min_length < 0: raise ValueError( "`min_length` should be a positive integer, but get {}".format( min_length)) if not isinstance(eos_token_id, int) or eos_token_id < 0: raise ValueError( "`eos_token_id` should be a positive integer, but get {}". format(eos_token_id)) self.min_length = min_length self.eos_token_id = eos_token_id def __call__(self, input_ids, logits): cur_len = input_ids.shape[-1] if cur_len < self.min_length: logits[:, self.eos_token_id] = -float("inf") return logits class RepetitionPenaltyLogitsProcessor(LogitsProcessor): r""" Enforcing an exponential penalty on repeated sequences. Args: repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty. See `this paper `__ for more details. 
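        Example (illustrative arithmetic only; the ``penalty`` value is made up)::

            # In __call__, logits of tokens that already appear in input_ids are
            # down-weighted: positive scores are divided by the penalty and
            # negative scores are multiplied by it.
            penalty = 1.2
            2.4 / penalty    # -> 2.0,  a likely repeated token becomes less likely
            -3.0 * penalty   # -> -3.6, an unlikely one becomes even less likely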
""" def __init__(self, penalty: float): if not isinstance(penalty, float) or not (penalty > 0): raise ValueError( f"`penalty` has to be a strictly positive float, but is {penalty}" ) self.penalty = penalty def __call__(self, input_ids, logits): score = paddle.index_sample(logits, input_ids) score = paddle.where(score < 0, score * self.penalty, score / self.penalty) input_ids = input_ids + paddle.arange(logits.shape[0]).unsqueeze( -1) * logits.shape[-1] outputs = paddle.scatter(logits.flatten(), input_ids.flatten(), score.flatten()).reshape(logits.shape) return outputs class HammingDiversityLogitsProcessor(LogitsProcessor): """ This `LogitsProcessor` enforces diverse beam search. Note that this logits processor is only effective for `group_beam_search`. See `this paper `__ for more details. Args: diversity_rate (float): This value is subtracted from a beam's score if it generates a token same as any beam from other group at a particular time. num_beams (int): Number of beams used for group beam search. num_beam_groups (int): Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. """ def __init__(self, diversity_rate, num_beams, num_beam_groups): if not isinstance(diversity_rate, float) or (not diversity_rate > 0.0): raise ValueError( "`diversity_rate` should be a float strictly larger than 0.") self._diversity_rate = diversity_rate if not isinstance(num_beams, int) or num_beams < 2: raise ValueError( "`num_beams` should be an integer strictly larger than 1.") self._num_beams = num_beams if not isinstance(num_beam_groups, int) or num_beam_groups < 2: raise ValueError( "`num_beam_groups` should be an integer strictly larger than 1." ) self._num_sub_beams = num_beams // num_beam_groups def __call__(self, input_ids, scores, current_tokens, beam_group_idx): batch_size = current_tokens.shape[0] // self._num_beams group_start_idx = beam_group_idx * self._num_sub_beams group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) group_size = group_end_idx - group_start_idx vocab_size = scores.shape[-1] if group_start_idx == 0: return scores for batch_idx in range(batch_size): previous_group_tokens = current_tokens[ batch_idx * self._num_beams:batch_idx * self._num_beams + group_start_idx] token_frequency = paddle.bincount( previous_group_tokens, minlength=vocab_size) scores[batch_idx * group_size:(batch_idx + 1) * group_size] -= self._diversity_rate * token_frequency return scores class ForcedBOSTokenLogitsProcessor(LogitsProcessor): """ This `LogitsProcessor` enforces the first generated token to be the selected `forced_bos_token`. Args: forced_bos_token_id (:obj:`int`): The id of the token to to be generated as the first token. """ def __init__(self, forced_bos_token_id): self.forced_bos_token_id = forced_bos_token_id def __call__(self, input_ids, scores): cur_len = input_ids.shape[-1] if cur_len == 1: num_tokens = scores.shape[1] scores[:, [ i for i in range(num_tokens) if i != self.forced_bos_token_id ]] = -float("inf") scores[:, self.forced_bos_token_id] = 0 return scores class ForcedEOSTokenLogitsProcessor(LogitsProcessor): """ This `LogitsProcessor` enforces the last generated token to be the selected `forced_eos_token`. Args: max_length (int): The maximum length of the sequence to be generated. forced_eos_token_id (int): The id of the token to to be generated as the last token. 
""" def __init__(self, max_length, forced_eos_token_id): self.max_length = max_length self.forced_eos_token_id = forced_eos_token_id def __call__(self, input_ids, scores): cur_len = input_ids.shape[-1] if cur_len == self.max_length - 1: num_tokens = scores.shape[1] scores[:, [ i for i in range(num_tokens) if i != self.forced_eos_token_id ]] = -1e9 #TODO change back to -inf after paddle.topk is fixed scores[:, self.forced_eos_token_id] = 0 return scores ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import paddle from paddle import framework from paddle import distributed as dist from paddle.nn import functional as F from paddle.autograd import PyLayer from paddle.fluid import core from paddle.nn.layer.layers import Layer from paddle.distributed import fleet from paddle.distributed.fleet.base import topology as tp from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients_with_group from ppfleetx.distributed.apis import env import numpy as np #################################################### # # # Distributed Communication Operator # # # #################################################### def scatter(input): hcg = env.get_hcg() group = hcg.get_model_parallel_group() parallelism = group.nranks rank = group.rank seq_len = input.shape[0] assert seq_len % parallelism == 0, "Input sequence length {} can't be divided exactly by sequence parallelism {}".format( seq_len, parallelism) interval = seq_len // parallelism input = paddle.slice( input, axes=[0], starts=[interval * rank], ends=[interval * (rank + 1)]) return input def all_gather(input): hcg = env.get_hcg() group = hcg.get_model_parallel_group() parallelism = group.nranks output_shape = input.shape output_shape[0] = output_shape[0] * parallelism output = paddle.empty(shape=output_shape, dtype=input.dtype) group.process_group.all_gather(input, output).wait() return output def reduce_scatter(input): hcg = env.get_hcg() group = hcg.get_model_parallel_group() parallelism = group.nranks output_shape = input.shape assert input.shape[ 0] % parallelism == 0, "Input sequence length {0} can't be divided exactly by sequence parallelism {1}".format( input.shape[0], parallelism) output_shape[0] = output_shape[0] // parallelism output = paddle.empty(shape=output_shape, dtype=input.dtype) dist.stream.reduce_scatter( output, input, op=dist.ReduceOp.SUM, group=group, sync_op=True) return output class ScatterOp(PyLayer): # input shape: [s, b, h], n is mp parallelism # after forward shape: [s/n, b, h] @staticmethod def forward(ctx, input): return scatter(input) @staticmethod def 
backward(ctx, grad): return all_gather(grad) class GatherOp(PyLayer): # input shape: [s/n, b, h], n is mp parallelism # after forward shape: [s, b, h] @staticmethod def forward(ctx, input): return all_gather(input) @staticmethod def backward(ctx, grad): return scatter(grad) # All gather along the first dim during forward pass # All reduce and scatter along the first dim during backward pass class AllGatherOp(PyLayer): # input shape: [s/n, b, h], n is mp parallelism # after forward shape: [s, b, h] @staticmethod def forward(ctx, input): return all_gather(input) # grad shape: [s, b, h], n is mp parallelism # after forward shape: [s/n, b, h] @staticmethod def backward(ctx, grad): return reduce_scatter(grad) # All reduce and scatter along the first dim during forward pass # All gather along the first dim during backward pass class ReduceScatterOp(PyLayer): # input shape: [s, b, h], n is mp parallelism # after forward shape: [s/n, b, h] @staticmethod def forward(ctx, input): return reduce_scatter(input) # grad shape: [s/n, b, h], n is mp parallelism # after forward shape: [s, b, h] @staticmethod def backward(ctx, grad): return all_gather(grad) ################################################### # # # Modified Parallel Linear Operator # # # ################################################### def mark_as_sequence_parallel_parameter(parameter): setattr(parameter, 'sequence_parallel', True) def is_sequence_parallel_parameter(parameter): return getattr(parameter, 'sequence_parallel', False) def create_fused_allreduce_gradient_hook(parameter_list, accumulation_steps): hcg = env.get_hcg() group = hcg.get_model_parallel_group() step = [0] accumulation_steps *= len(parameter_list) def __impl__(grad): step[0] += 1 if step[0] == accumulation_steps: step[0] = 0 fused_allreduce_gradients_with_group( parameter_list, group=group, scale=1.0) return grad return __impl__ def create_non_fused_allreduce_gradient_hook(param, accumulation_steps): hcg = env.get_hcg() pg = hcg.get_model_parallel_group().process_group step = [0] @paddle.autograd.no_grad() def __impl__(): step[0] += 1 if (step[0] % accumulation_steps) == 0: if hasattr(param, "main_grad"): pg.allreduce(param.main_grad).wait() else: pg.allreduce(param.grad).wait() return __impl__ def register_sequence_parallel_allreduce_hooks( model, accumulation_steps, fuse_sequence_parallel_allreduce): if accumulation_steps <= 0 or not paddle.distributed.is_initialized(): return mp_group = env.get_hcg().get_model_parallel_group() if mp_group.nranks <= 1: return params = [] for p in model.parameters(): if is_sequence_parallel_parameter(p): params.append(p) if fuse_sequence_parallel_allreduce: hook = create_fused_allreduce_gradient_hook(params, accumulation_steps) for p in params: p._register_backward_hook(hook) else: for p in params: hook = create_non_fused_allreduce_gradient_hook(p, accumulation_steps) p._register_backward_hook(hook) def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False class ColumnSequenceParallelLinear(Layer): def __init__(self, in_features, out_features, weight_attr=None, has_bias=None, gather_output=True, fuse_matmul_bias=False, mp_group=None, name=None): super(ColumnSequenceParallelLinear, self).__init__() hcg = env.get_hcg() self.model_parallel_group = hcg.get_model_parallel_group( ) if mp_group is None else mp_group self.world_size = hcg.get_model_parallel_group( ).nranks if mp_group is None else 
mp_group.nranks self._name = name self.is_mp = (self.world_size > 1) assert gather_output is False, "If sequence_parallel is True, \ gather_output is False" self.gather_output = gather_output assert out_features % self.world_size == 0, ( "Number of column of the weight for linear ({}) must be" " divisible by model parallel size ({})".format(out_features, self.world_size)) self.output_size_per_partition = out_features // self.world_size self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() if self.is_mp and paddle.in_dynamic_mode(): with get_rng_state_tracker().rng_state(): self.weight = self.create_parameter( shape=[in_features, self.output_size_per_partition], attr=self._weight_attr, dtype=self._dtype, is_bias=False) else: self.weight = self.create_parameter( shape=[in_features, self.output_size_per_partition], attr=self._weight_attr, dtype=self._dtype, is_bias=False) self.weight.is_distributed = True if self.is_mp else False if has_bias: # initialize bias to zero like Megatron self.bias = self.create_parameter( shape=[self.output_size_per_partition], attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) self.bias.is_distributed = True if self.is_mp else False else: self.bias = None self.linear = F.linear if fuse_matmul_bias: if not is_fused_matmul_bias_supported(): raise NotImplementedError( "You set fuse_matmul_bias=True in ColumnSequenceParallelLinear, " "however, the paddle you are using not support this operation. " "Please set fuse_matmul_bias=False or use paddle compiled " "with cuda 11.6 or higher.") from paddle.incubate.nn.functional import fused_linear self.linear = fused_linear def forward(self, x): # sequence parallelism is same as model parallelism # if sequence parallel is true, input shape is [s, b, h] # else input shape is [b, s, h] if self.is_mp: input_parallel = AllGatherOp.apply(x) else: input_parallel = x output = self.linear( input_parallel, self.weight, self.bias, name=self._name) return output class RowSequenceParallelLinear(Layer): def __init__(self, in_features, out_features, weight_attr=None, has_bias=True, input_is_parallel=False, fuse_matmul_bias=False, mp_group=None, name=None): super(RowSequenceParallelLinear, self).__init__() self.in_features = in_features self.out_features = out_features assert input_is_parallel is True, "If sequence_parallel is True, \ input_is_parallel should be true." 
self.input_is_parallel = input_is_parallel self._weight_attr = weight_attr self._dtype = self._helper.get_default_dtype() self._name = name hcg = env.get_hcg() self.model_parallel_group = hcg.get_model_parallel_group( ) if mp_group is None else mp_group self.world_size = hcg.get_model_parallel_group( ).nranks if mp_group is None else mp_group.nranks self.rank = hcg.get_model_parallel_group( ).rank if mp_group is None else mp_group.rank self.is_mp = (self.world_size > 1) assert in_features % self.world_size == 0, ( "Number of row of the weight for linear ({}) must be" " divisible by model parallel size ({})".format(in_features, self.world_size)) self.input_size_per_partition = in_features // self.world_size if self.is_mp and paddle.in_dynamic_mode(): with get_rng_state_tracker().rng_state(): self.weight = self.create_parameter( shape=[self.input_size_per_partition, self.out_features], attr=self._weight_attr, dtype=self._dtype, is_bias=False) else: self.weight = self.create_parameter( shape=[self.input_size_per_partition, self.out_features], attr=self._weight_attr, dtype=self._dtype, is_bias=False) self.weight.is_distributed = True if self.is_mp else False # if sequence parallel is true, # register hook to all_reduce gradient of weight and bias if has_bias: self.bias = self.create_parameter( shape=[self.out_features], attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) if self.is_mp: mark_as_sequence_parallel_parameter(self.bias) else: self.bias = None self.linear = F.linear if fuse_matmul_bias: if not is_fused_matmul_bias_supported(): raise NotImplementedError( "You set fuse_matmul_bias=True in RowParallelLinear, " "however, the paddle you are using not support this operation. " "Please set fuse_matmul_bias=False or use paddle compiled " "with cuda 11.6 or higher.") from paddle.incubate.nn.functional import fused_linear self.linear = fused_linear def forward(self, x): input_parallel = x if self.is_mp: output_parallel = self.linear( input_parallel, self.weight, name=self._name) output_ = ReduceScatterOp.apply(output_parallel) # if self.bias is not none, sequence parallel will use # register_hook to all_reduce self.bias output = output_ + self.bias if self.bias is not None else output_ else: output = self.linear( input_parallel, self.weight, self.bias, name=self._name) return output ================================================ FILE: ppfleetx/models/language_model/gpt/dygraph/single_model.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import collections import logging from distutils.util import strtobool import os import numpy as np import math import paddle import paddle.nn as nn import paddle.nn.functional as F import paddle.tensor as tensor from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list from paddle.common_ops_import import convert_dtype import paddle.incubate as incubate from paddle.distributed.fleet.utils import recompute from paddle.incubate.nn import FusedLinear from .processor import ( LogitsProcessorList, MinLengthLogitsProcessor, HammingDiversityLogitsProcessor, RepetitionPenaltyLogitsProcessor, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor) from ppfleetx.models.language_model.moe import MoELayer from ppfleetx.models.language_model.moe_exp.layer import MoE from ppfleetx.utils.log import logger try: from paddle.nn.functional.flash_attention import flash_attention except: flash_attention = None def get_attr(layer, name): if getattr(layer, name, None) is not None: return getattr(layer, name, None) else: return get_attr(layer._layer, name) class ExpertLayer(nn.Layer): def __init__(self, d_model, d_hidden, name=None): super(ExpertLayer, self).__init__() self.htoh4 = nn.Linear( d_model, d_hidden, weight_attr=nn.initializer.KaimingUniform(), bias_attr=nn.initializer.Constant(value=0.0)) self.h4toh = nn.Linear( d_hidden, d_model, weight_attr=nn.initializer.KaimingUniform(), bias_attr=nn.initializer.Constant(value=0.0)) self.htoh4.weight.name = "expert_" + self.htoh4.weight.name self.h4toh.weight.name = "expert_" + self.h4toh.weight.name self.htoh4.bias.name = "expert_" + self.htoh4.bias.name self.h4toh.bias.name = "expert_" + self.h4toh.bias.name def forward(self, x): x = self.htoh4(x) x = F.gelu(x, approximate=True) x = self.h4toh(x) return x class MultiHeadAttention(nn.Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. 
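        Shape walkthrough (illustrative, matching ``_prepare_qkv``/``core_attn``
        below, with ``embed_dim=768`` and ``num_heads=12`` so ``head_dim=64``)::

            query                                 [batch, seq_len, 768]
            q/k/v after projection + reshape      [batch, seq_len, 12, 64]
            after transpose for attention         [batch, 12, seq_len, 64]
            output with heads merged back         [batch, seq_len, 768]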
""" Cache = collections.namedtuple("Cache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None, output_layer_weight_attr=None, fuse_attn_qkv=False, scale_qk_coeff=1.0, fused_linear=False, use_recompute=False, recompute_granularity="full", do_recompute=True, use_flash_attn=False): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.num_heads = num_heads self.dropout = dropout self.need_weights = need_weights self.fuse_attn_qkv = fuse_attn_qkv self.scale_qk_coeff = scale_qk_coeff self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.do_recompute = do_recompute self.use_flash_attn = use_flash_attn if flash_attention else None self.head_dim = embed_dim // num_heads assert self.head_dim * \ num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" Linear = FusedLinear if fused_linear else nn.Linear if self.fuse_attn_qkv: assert self.kdim == embed_dim assert self.vdim == embed_dim self.qkv_proj = Linear( embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr) else: self.q_proj = Linear( embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( embed_dim, embed_dim, output_layer_weight_attr, bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) assert not isinstance( cache, self.StaticCache ), "cache currently does not support the StaticCache type" if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. """ q = self.q_proj(query) q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference k = tensor.concat([cache.k, k], axis=1) v = tensor.concat([cache.v, v], axis=1) if use_cache is True: cache = self.Cache(k, v) return (q, k, v, cache) if use_cache else (q, k, v, None) def compute_kv(self, key, value): r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. 
""" k = self.k_proj(key) v = self.v_proj(value) k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) return k, v def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. The generated cache is an instance of `MultiHeadAttention.Cache` or an instance of `MultiHeadAttention.StaticCache`. """ if type == MultiHeadAttention.StaticCache: # static_kv k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) v = layers.fill_constant_batch_size_like( input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) def _flash_attention(self, q, k, v, attn_mask=None): out, weights = flash_attention( q, k, v, self.dropout, causal=True, return_softmax=self.need_weights) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) return out, weights def core_attn(self, q, k, v, attn_mask=None): perm = [0, 2, 1, 3] q = tensor.transpose(x=q, perm=perm) k = tensor.transpose(x=k, perm=perm) v = tensor.transpose(x=v, perm=perm) # scale dot product attention scale_qk_coeff = self.scale_qk_coeff * self.head_dim**0.5 product = paddle.matmul( x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) if self.scale_qk_coeff != 1.0: product = product.scale(self.scale_qk_coeff) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) else: weights = incubate.softmax_mask_fuse_upper_triangle(product) if self.dropout: weights = F.dropout( weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, -1]) return out, weights def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if self.fuse_attn_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_recompute and self.recompute_granularity == "core_attn" and self.do_recompute: out, weights = recompute(self.core_attn, q, k, v, attn_mask) elif self.use_flash_attn and attn_mask is None: out, weights = self._flash_attention(q, k, v) else: out, weights = self.core_attn(q, k, v, attn_mask=attn_mask) # project to output out = self.out_proj(out) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs) class TransformerDecoder(nn.Layer): """ TransformerDecoder is a stack of N decoder layers. 
""" def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None, use_recompute=False, recompute_granularity="full", no_recompute_layers=None): super(TransformerDecoder, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.no_recompute_layers = no_recompute_layers self.num_layers = num_layers self.layers = decoder_layers self.norm = norm self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) elif norm is not None: raise ValueError("Only support LayerNorm") def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False, cache=None): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. """ output = tgt new_caches = [] for i, mod in enumerate(self.layers): if cache is None: if use_cache: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache) new_caches.append(new_cache) else: if self.use_recompute and self.recompute_granularity == "full" and i not in self.no_recompute_layers: output = recompute(mod, output, memory, tgt_mask, use_cache, cache) else: output = mod(output, memory, tgt_mask, use_cache, cache) else: output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` for more details. If `do_zip` is True, apply `zip` on these tuples to get a list with two elements. """ cache = [layer.gen_cache(memory) for layer in self.layers] if do_zip: cache = list(zip(*cache)) return cache class TransformerDecoderLayer(nn.Layer): """ The transformer decoder layer. It contains multiheadattention and some linear layers. 
""" def __init__(self, d_model, nhead, dim_feedforward, num_experts=1, dropout=0.1, activation="gelu", attn_dropout=None, act_dropout=None, normalize_before=True, topk=1, moe_use_residual=False, moe_train_capacity_factor=1.0, moe_eval_capacity_factor=1.0, moe_min_capacity=4, moe_token_dropping=True, enable_expert_tensor_parallelism=False, weight_attr=None, bias_attr=None, output_layer_weight_attr=None, fused_linear=False, fuse_attn_qkv=False, scale_qk_coeff=1.0, use_recompute=False, recompute_granularity="full", do_recompute=True, skip_quant_tensors=[], use_flash_attn=False): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before self.use_recompute = use_recompute self.recompute_granularity = recompute_granularity self.do_recompute = do_recompute self.num_experts = num_experts weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) output_layer_weight_attrs = _convert_param_attr_to_list( output_layer_weight_attr, 3) Linear = FusedLinear if fused_linear else nn.Linear self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], output_layer_weight_attr=output_layer_weight_attrs[0], fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=scale_qk_coeff, use_recompute=use_recompute, recompute_granularity=recompute_granularity, do_recompute=do_recompute, use_flash_attn=use_flash_attn) self.moe_mlp = None if self.num_experts > 1: assert (topk == 1, "Only support topk=1 currently.") self.moe_mlp = MoE( d_model, ExpertLayer(d_model, dim_feedforward), self.num_experts, ep_size=1, k=topk, use_residual=moe_use_residual, capacity_factor=moe_train_capacity_factor, eval_capacity_factor=moe_eval_capacity_factor, min_capacity=moe_min_capacity, drop_tokens=moe_token_dropping, enable_expert_tensor_parallelism=enable_expert_tensor_parallelism ) else: self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.linear2 = Linear( dim_feedforward, d_model, output_layer_weight_attrs[2], bias_attr=bias_attrs[2]) if 'linear1' in skip_quant_tensors: self.linear1.skip_quant = True if 'linear2' in skip_quant_tensors: self.linear2.skip_quant = True self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") if activation == 'gelu': self.activation = nn.GELU(approximate=True) else: self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): residual = tgt if self.normalize_before: tgt = self.norm1(tgt) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn" and self.do_recompute: tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, use_cache, cache) else: tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) residual = tgt if self.normalize_before: tgt = self.norm2(tgt) # if self.expert_mode: # tgt = self.moe_mlp(tgt) if self.num_experts > 1: tgt = 
self.moe_mlp(tgt) else: tgt = self.dropout2( self.linear2(self.activation(self.linear1(tgt)))) tgt = residual + tgt if not self.normalize_before: tgt = self.norm2(tgt) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): incremental_cache = self.self_attn.gen_cache( memory, type=self.self_attn.Cache) return incremental_cache class GPTEmbeddings(nn.Layer): """ Include embeddings from word and position embeddings. """ def __init__(self, vocab_size, hidden_size=768, hidden_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02, freeze_embedding=False): super(GPTEmbeddings, self).__init__() self.word_embeddings = nn.Embedding( vocab_size, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) self.position_embeddings = nn.Embedding( max_position_embeddings, hidden_size, weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( mean=0.0, std=initializer_range))) if freeze_embedding: self.word_embeddings.weight.learning_rate = 0.0 self.position_embeddings.weight.learning_rate = 0.0 self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): if position_ids is None: ones = paddle.ones_like(input_ids, dtype="int64") seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones input_embedings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings embeddings = self.dropout(embeddings) return embeddings class GPTModel(nn.Layer): def __init__(self, vocab_size=51200, hidden_size=768, num_layers=12, num_attention_heads=12, ffn_hidden_size=3072, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, use_recompute=False, initializer_range=0.02, num_experts=[1], expert_interval=2, topk=1, moe_use_residual=False, moe_train_capacity_factor=1.0, moe_eval_capacity_factor=1.0, moe_min_capacity=4, moe_token_dropping=True, enable_expert_tensor_parallelism=False, fused_linear=False, fuse_attn_qkv=False, scale_qk_by_layer_num=True, recompute_granularity="full", sequence_parallel=False, no_recompute_layers=None, skip_tensor_map={}, freeze_embedding=False, use_flash_attn=False, fused_softmax_with_triangular=False): super(GPTModel, self).__init__() if no_recompute_layers is None: no_recompute_layers = [] self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size self.fused_softmax_with_triangular = fused_softmax_with_triangular if use_flash_attn: if flash_attention: logger.info("Flash-attention enabled.") else: use_flash_attn = False logger.warning( "Flash-attention is not support in this Paddle version.") self.embeddings = GPTEmbeddings( vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, type_vocab_size, self.initializer_range, freeze_embedding) assert len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval, \ 'num_experts must be either a single value or a list of the same length as the number of MoE layers' # Expand the list of MoE experts num to MoE layers num if len(num_experts) == 1: num_experts = num_experts * (num_layers // expert_interval) decoder_layers = nn.LayerList() for i in range(num_layers): # TODO: original layer_num = i + 1 + offset here layer_num = i + 1 if layer_num % expert_interval == 0: n_e = num_experts[(layer_num - 1) // expert_interval] else: n_e = 1 decoder_layers.append( 
TransformerDecoderLayer( d_model=hidden_size, nhead=num_attention_heads, dim_feedforward=ffn_hidden_size, num_experts=n_e, dropout=hidden_dropout_prob, activation="gelu", attn_dropout=attention_probs_dropout_prob, act_dropout=hidden_dropout_prob, topk=topk, moe_use_residual=moe_use_residual, moe_train_capacity_factor=moe_train_capacity_factor, moe_eval_capacity_factor=moe_eval_capacity_factor, moe_min_capacity=moe_min_capacity, moe_token_dropping=moe_token_dropping, enable_expert_tensor_parallelism=enable_expert_tensor_parallelism, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), output_layer_weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range / math.sqrt( 2.0 * num_layers))), bias_attr=None, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=num_layers if scale_qk_by_layer_num else 1.0, use_recompute=use_recompute, recompute_granularity=recompute_granularity, do_recompute=i not in no_recompute_layers, skip_quant_tensors=skip_tensor_map.get('block_{}'.format( i), []), use_flash_attn=use_flash_attn)) self.decoder = TransformerDecoder( decoder_layers, num_layers, norm="LayerNorm", hidden_size=hidden_size, use_recompute=use_recompute, recompute_granularity=recompute_granularity, no_recompute_layers=no_recompute_layers) def forward(self, input_ids, position_ids=None, attention_mask=None, use_cache=False, cache=None): if position_ids is None: past_length = 0 if cache is not None: past_length = paddle.shape(attention_mask)[-1] - 1 position_ids = paddle.arange( past_length, paddle.shape(input_ids)[-1] + past_length, dtype=input_ids.dtype) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( (paddle.shape(input_ids)[-1], paddle.shape(input_ids)[-1])) * -1e4, diagonal=1) if attention_mask is not None: if len(attention_mask.shape) == 2: attention_mask = attention_mask[:, None, None, :] attention_mask = attention_mask + causal_mask else: attention_mask = causal_mask # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True encoder_outputs = self.decoder( embedding_output, memory=None, tgt_mask=None if (self.fused_softmax_with_triangular and self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) return encoder_outputs class GPTForPretraining(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
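
    Example (an illustrative sketch with toy sizes; the import path below is
    assumed from the package layout):

        .. code-block::

            import paddle
            from ppfleetx.models.language_model.gpt import (
                GPTModel, GPTForPretraining, GPTPretrainingCriterion)

            # Deliberately tiny model so the snippet runs quickly.
            gpt = GPTModel(
                vocab_size=1000,
                hidden_size=64,
                num_layers=2,
                num_attention_heads=4,
                ffn_hidden_size=256,
                max_position_embeddings=128)
            model = GPTForPretraining(gpt)
            criterion = GPTPretrainingCriterion()

            tokens = paddle.randint(0, 1000, shape=[2, 16], dtype='int64')
            labels = paddle.randint(0, 1000, shape=[2, 16], dtype='int64')
            loss_mask = paddle.ones([2, 16], dtype='float32')

            logits = model(tokens)                       # [2, 16, 1000]
            loss = criterion(logits, labels, loss_mask)  # scalar loss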
""" def __init__(self, gpt): super(GPTForPretraining, self).__init__() self.gpt = gpt def forward(self, input_ids, position_ids=None, attention_mask=None, masked_positions=None, use_cache=False, cache=None): outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) if use_cache: encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs logits = paddle.matmul( encoder_outputs, get_attr(self.gpt.embeddings.word_embeddings, "weight"), transpose_y=True) if use_cache: return logits, cached_kvs else: return logits class GPTPretrainingCriterion(nn.Layer): """ Criterion for GPT. It calculates the final loss. """ def __init__(self, topo=None): super(GPTPretrainingCriterion, self).__init__() self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") def forward(self, prediction_scores, masked_lm_labels, loss_mask): """ Args: prediction_scores(Tensor): The logits of masked token prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. masked_lm_labels(Tensor): The labels of the masked language modeling, the dimensionality of `masked_lm_labels` is equal to `prediction_scores`. Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. loss_mask(Tensor): Mask used for calculating the loss of the masked language modeling to avoid calculating some unwanted tokens. Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() return loss class GPTForSequenceClassification(nn.Layer): """ GPT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. Args: gpt (:class:`GPTModel`): An instance of GPTModel. num_classes (int, optional): The number of classes. Defaults to `2`. """ def __init__(self, gpt, num_classes=2): super(GPTForSequenceClassification, self).__init__() self.gpt = gpt self.score = nn.Linear( self.gpt.hidden_size, num_classes, bias_attr=False) from paddle.nn.initializer import Normal normal_ = Normal(std=self.gpt.initializer_range) normal_(self.score.weight) def forward(self, input_ids, position_ids=None, attention_mask=None): output = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask) logits = self.score(output) # padding index maybe 0 eos_token_id = 0 # sequence_lengths shape [bs,] sequence_lengths = (input_ids != eos_token_id).astype("int64").sum( axis=-1) - 1 pooled_logits = logits.gather_nd( paddle.stack( [paddle.arange(output.shape[0]), sequence_lengths], axis=-1)) return pooled_logits class GPTForGeneration(nn.Layer): """ GPT Model with pretraining tasks on top. Args: gpt (:class:`GPTModel`): An instance of :class:`GPTModel`. 
""" def __init__(self, gpt, configs): super(GPTForGeneration, self).__init__() self.gpt = gpt self.configs = configs self.max_length = self.configs.get('max_dec_len', 20) self.min_length = self.configs.get('min_dec_len', 0) self.decode_strategy = self.configs.get('decode_strategy', 'sampling') self.temperature = self.configs.get('temperature', 1.0) self.top_k = self.configs.get('top_k', 0) self.top_p = self.configs.get('top_p', 1.0) self.use_topp_sampling = self.configs.get('use_topp_sampling', False) self.inference = self.configs.get('inference', False) self.repetition_penalty = self.configs.get('repetition_penalty', 1.0) self.num_beams = self.configs.get('num_beams', 1) self.num_beam_groups = self.configs.get('num_beam_groups', 1) self.length_penalty = self.configs.get('length_penalty', 0.0) self.early_stopping = self.configs.get('early_stopping', False) self.bos_token_id = self.configs.get('bos_token_id', None) self.eos_token_id = self.configs.get('eos_token_id', None) self.pad_token_id = self.configs.get('pad_token_id', None) self.decoder_start_token_id = self.configs.get( 'decoder_start_token_id', None) self.forced_bos_token_id = self.configs.get('forced_bos_token_id', None) self.forced_eos_token_id = self.configs.get('forced_eos_token_id', None) self.num_return_sequences = self.configs.get('num_return_sequences', 1) self.diversity_rate = self.configs.get('diversity_rate', 0.0) self.use_cache = self.configs.get('use_cache', True) def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): batch_size = 1 if bos_token_id is None: raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") if encoder_output is not None: batch_size = encoder_output.shape[0] return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( input_ids == pad_token_id).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( (eos_token_id is not None) and (pad_token_id != eos_token_id)) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids == pad_token_id ).astype(paddle.get_default_dtype()) * -1e9 else: attention_mask = paddle.zeros_like( input_ids, dtype=paddle.get_default_dtype()) return paddle.unsqueeze(attention_mask, axis=[1, 2]) def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): # update scores unfinished_scores = (scores * length + next_scores) / (length + 1) scores = paddle.where(unfinished_flag, unfinished_scores, scores) return scores def get_logits_processor(self, min_length=None, max_length=None, eos_token_id=None, forced_bos_token_id=None, forced_eos_token_id=None, num_beams=1, num_beam_groups=1, diversity_rate=0.0, repetition_penalty=None): processors = LogitsProcessorList() if min_length is not None and eos_token_id is not None and min_length > -1: processors.append( MinLengthLogitsProcessor(min_length, eos_token_id)) if num_beam_groups > 1 and diversity_rate > 0.0: processors.append( HammingDiversityLogitsProcessor( diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups)) if repetition_penalty is not None and repetition_penalty != 1.0: processors.append( RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) if forced_bos_token_id is not None: processors.append( ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) if forced_eos_token_id is not 
None: processors.append( ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) # TODO # Add more pre_processing for distribution return processors def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): index = paddle.tile( paddle.arange(paddle.shape(input_ids)[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) input_ids = paddle.gather(input_ids, index) if attention_mask is not None: model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = paddle.gather(position_ids, index) if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: seq_len = model_kwargs["seq_len"] model_kwargs["seq_len"] = paddle.gather(seq_len, index) if "encoder_output" in model_kwargs and model_kwargs[ "encoder_output"] is not None: encoder_output = model_kwargs["encoder_output"] model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.gather(role_ids, index) return input_ids, model_kwargs def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): # only last token for inputs_ids if cache is defined in kwargs position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: if len(attention_mask.shape) == 4: attention_mask = attention_mask[:, -1, -1, :] if "int" in paddle.common_ops_import.convert_dtype( attention_mask.dtype): attention_mask = (1.0 - attention_mask) * -1e4 return { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask, "cache": cache } def update_model_kwargs_for_generation(self, next_tokens, outputs, model_kwargs, is_encoder_decoder=False): # Update the model inputs during generation. # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` # and they contain pad value, the result vectors updated by this method # may be different from expected. In this case, you need to rewrite the # method. 
# update cache if isinstance(outputs, tuple): model_kwargs["cache"] = outputs[1] # update token_type_ids with last value if "token_type_ids" in model_kwargs and model_kwargs[ "token_type_ids"] is not None: token_type_ids = model_kwargs["token_type_ids"] model_kwargs["token_type_ids"] = paddle.concat( [token_type_ids, token_type_ids[:, -1:]], axis=-1) # update position_ids if "position_ids" in model_kwargs and model_kwargs[ "position_ids"] is not None: position_ids = model_kwargs["position_ids"] model_kwargs["position_ids"] = position_ids[:, -1:] + 1 # update attention_mask if not is_encoder_decoder and "attention_mask" in model_kwargs: attention_mask = model_kwargs["attention_mask"] # nn.Pad2D don't support the data type `bool` if convert_dtype(attention_mask.dtype) == 'bool': attention_mask = paddle.cast(attention_mask, 'int64') if len(attention_mask.shape) == 4: attention_mask = nn.Pad2D( [0, 0, 0, 1], mode='replicate')(attention_mask) attention_mask = nn.Pad2D( [0, 1, 0, 0], value=-1e4)(attention_mask) dtype = convert_dtype(attention_mask.dtype) if 'int' in dtype: attention_mask[:, :, -1, -1] = 1 elif 'float' in dtype: attention_mask[:, :, -1, -1] = 0.0 else: raise ValueError( 'The data type of input `attention_mask` must ' 'be bool, int or float') else: attention_mask = paddle.concat( [ attention_mask, paddle.ones( [attention_mask.shape[0], 1], dtype="int64") ], axis=-1) model_kwargs["attention_mask"] = attention_mask # update role_ids if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: role_ids = model_kwargs["role_ids"] model_kwargs["role_ids"] = paddle.concat( [role_ids, role_ids[:, -1:]], axis=-1) model_kwargs['res'] = paddle.concat( [model_kwargs['res'], next_tokens], axis=1) return model_kwargs def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast( sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + paddle.arange(probs.shape[ 0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = input_ids.shape # used for compute on gpu, avoid memcpy D2H cur_len_gpu = paddle.full([1], cur_len, dtype='int64') origin_len = 
input_ids.shape[1] # used for compute on gpu, avoid memcpy D2H origin_len_gpu = paddle.full([1], origin_len, dtype='int64') unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full( [batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) res = paddle.assign(input_ids) model_kwargs['res'] = res # use_cache is immutable, we split it off other mutable kwargs. assert 'use_cache' in model_kwargs immutable = {'use_cache': model_kwargs['use_cache']} del model_kwargs['use_cache'] def _forward_(**args): model_inputs = self.prepare_inputs_for_generation( input_ids, **args, **immutable) return self.gpt(**model_inputs, **immutable) def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): logits = outputs[0] if isinstance(outputs, tuple) else outputs logits = paddle.matmul( logits, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) if temperature is None or temperature == 1.0: probs = paddle.assign(origin_probs) origin_probs = paddle.log(origin_probs) else: origin_probs = paddle.log(origin_probs) logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: if self.use_topp_sampling: try: from ppfleetx_ops import topp_sampling except ImportError: raise ImportError( "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" ) top_ps_tensor = paddle.full( shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) _, next_tokens = topp_sampling( probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) if not self.use_topp_sampling: next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation( scores, next_scores, cur_len - origin_len, unfinished_flag) input_ids = next_tokens if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) model_kwargs = self.update_model_kwargs_for_generation( next_tokens, outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids, scores, unfinished_flag, model_kwargs # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement # the value in model_kwargs should be tensor before while loop outputs = _forward_(**model_kwargs) input_ids, scores, unfinished_flag, model_kwargs = _post_process_( outputs, input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) attn_mask = model_kwargs['attention_mask'] # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
model_kwargs['attention_mask'] = paddle.reshape( attn_mask, paddle.shape(attn_mask)) model_kwargs['cache'] = outputs[1] if isinstance(outputs, tuple) else None while cur_len < max_length: # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) # and change it to pass directly to _post_process_ to avoid # closed-loop problem of dynamic-to-static model input_ids, scores, unfinished_flag, model_kwargs = _post_process_( _forward_(**model_kwargs), input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs) if not self.inference: cur_len += 1 else: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static paddle.increment(cur_len) paddle.increment(cur_len_gpu) if not paddle.any(unfinished_flag): break return model_kwargs['res'][:, origin_len:], scores def forward(self, input_ids=None, **model_kwargs): max_length = self.max_length min_length = self.min_length decode_strategy = self.decode_strategy temperature = self.temperature top_k = self.top_k top_p = self.top_p repetition_penalty = self.repetition_penalty num_beams = self.num_beams num_beam_groups = self.num_beam_groups length_penalty = self.length_penalty early_stopping = self.early_stopping bos_token_id = self.bos_token_id eos_token_id = self.eos_token_id pad_token_id = self.pad_token_id decoder_start_token_id = self.decoder_start_token_id forced_bos_token_id = self.forced_bos_token_id forced_eos_token_id = self.forced_eos_token_id num_return_sequences = self.num_return_sequences diversity_rate = self.diversity_rate use_cache = self.use_cache assert ( decode_strategy in ["greedy_search", "sampling", "beam_search"] ), "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( decode_strategy) bos_token_id = bos_token_id if bos_token_id is not None else getattr( self.gpt, 'bos_token_id', None) eos_token_id = eos_token_id if eos_token_id is not None else getattr( self.gpt, 'eos_token_id', None) pad_token_id = pad_token_id if pad_token_id is not None else getattr( self.gpt, 'pad_token_id', None) forced_bos_token_id = forced_bos_token_id if forced_bos_token_id is not None else getattr( self.gpt, 'forced_bos_token_id', None) forced_eos_token_id = forced_eos_token_id if forced_eos_token_id is not None else getattr( self.gpt, 'forced_eos_token_id', None) decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else getattr( self.gpt, 'decoder_start_token_id', None) # params check if input_ids is None: # Init `input_ids` with bos_token_id input_ids = self.prepare_input_ids_for_generation(bos_token_id) if model_kwargs.get("attention_mask", None) is None: # TODO # Init `attention_mask` depending on `pad_token_id` model_kwargs[ "attention_mask"] = self.prepare_attention_mask_for_generation( input_ids, pad_token_id, eos_token_id) if model_kwargs.get("position_ids", None) is None: model_kwargs['position_ids'] = paddle.arange( 0, paddle.shape(model_kwargs['attention_mask'])[-1], dtype=input_ids.dtype).unsqueeze(0) self.is_encoder_decoder = False model_kwargs["use_cache"] = use_cache if self.inference: # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static min_len = input_ids.shape[-1] max_len = input_ids.shape[-1] paddle.increment(min_len, min_length) paddle.increment(max_len, max_length) else: input_len = input_ids.shape[-1] max_len = max_length + input_len min_len = min_length + input_len logits_processors = self.get_logits_processor( min_length=min_len, max_length=max_len, eos_token_id=eos_token_id, 
forced_bos_token_id=forced_bos_token_id, forced_eos_token_id=forced_eos_token_id, num_beams=num_beams, num_beam_groups=num_beam_groups, diversity_rate=diversity_rate, repetition_penalty=repetition_penalty) if decode_strategy == 'sampling': if num_return_sequences > 1: input_ids, model_kwargs = self.expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, **model_kwargs) ret = self.sample(input_ids, logits_processors, max_len, pad_token_id, eos_token_id, top_k, top_p, temperature, **model_kwargs) else: raise ValueError(f'Not support {decode_strategy} strategy yet!') return ret ================================================ FILE: ppfleetx/models/language_model/language_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import copy import math import numpy as np import types import paddle from paddle.static import InputSpec import paddle.distributed.fleet as fleet from ppfleetx.core.module.basic_module import BasicModule import ppfleetx.models.language_model.gpt as gpt from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import register_sequence_parallel_allreduce_hooks from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from .utils import process_configs from ppfleetx.data.tokenizers import GPTTokenizer from .metrics import * # TODO(haohongxiang): to solve the problem of cross-reference import paddlenlp from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer MODEL_CLASSES = { "GPT": (GPTTokenizer, "gpt2"), "MoE": (GPTTokenizer, "gpt2"), "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"), } def get_model_size(l, h, v, s): P = 0 # embedding P += (v + s) * h # attention P += (4 * h * h + 4 * h) * l # layer_norm of decoder P += (2 * (2 * h)) * l # FFN Layer P += (8 * h * h + 5 * h) * l # layer_norm of transformer P += 2 * h logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) def vocab_size_with_padding(vocab_size, div_unit, mp_degree): padded_size = vocab_size multiple = div_unit * mp_degree while (padded_size % multiple) != 0: padded_size += 1 logging.warning(' > padded vocab (size: {}) with {} dummy tokens ' '(new size: {})'.format(vocab_size, padded_size - vocab_size, padded_size)) return padded_size class LanguageModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.data_world_size = env.get_data_world_size() super(LanguageModule, self).__init__(configs) self.loss_fn = self.get_loss_fn() def process_configs(self, configs): configs = process_configs(configs) return configs def forward(self, tokens, ids): return self.model(tokens, ids) def training_step(self, batch): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = self(tokens, position_ids) loss = self.loss_fn(preds, labels, loss_mask) return loss def 
training_step_end(self, log_dict): speed = 1. / log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len loss_scale_str = "loss_scale: %.9f," % ( log_dict['loss_scale']) if log_dict.get('loss_scale', None) is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e, found_inf: %.0f" % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_step'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, \ loss_scale_str, log_dict['lr'], log_dict['found_inf'])) def validation_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def validation_step_end(self, log_dict): speed = 1. / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['loss'], log_dict['eval_cost'], speed)) def test_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def test_step_end(self, log_dict): speed = 1. / log_dict['test_cost'] logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'], speed)) def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) class GPTModule(LanguageModule): def __init__(self, configs): super(GPTModule, self).__init__(configs) if configs.Model.sequence_parallel: register_sequence_parallel_allreduce_hooks( self, configs.Engine.accumulate_steps, configs.Distributed.fuse_sequence_parallel_allreduce) def get_model(self): model_setting = copy.deepcopy(self.configs.Model) if 'Compress' in self.configs and 'Quantization' in self.configs.Compress: quant_setting = copy.deepcopy(self.configs.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len get_model_size(l, h, v, s) if self.nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree if self.configs.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( 
gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) return model def get_loss_fn(self): if self.nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird( sequence_parallel=self.configs.Model.sequence_parallel) return loss_fn def pretreating_batch(self, batch): if self.configs.Distributed.pp_degree > 1: tokens, position_ids, labels, loss_mask = batch data = [(tokens, position_ids), (labels, loss_mask)] return data else: return batch def input_spec(self): return [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] def inference_end(self, outputs): for k, v in outputs.items(): for i in range(v.shape[0]): out_ids = [int(x) for x in v[i]] ret_str = self.tokenizer.decode(out_ids) # ret_str = text[i] + ret_str print(ret_str) class GPTFinetuneModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.data_world_size = env.get_data_world_size() super(GPTFinetuneModule, self).__init__(configs) # self.loss_config will be init in super class by get_model() assert self.loss_config is not None assert 'train' in self.loss_config assert 'eval' in self.loss_config train_loss = copy.deepcopy(self.loss_config.train) train_loss_cls = train_loss.pop('name') self.loss_fn = eval(f'paddle.nn.loss.{train_loss_cls}')(**train_loss) eval_loss = copy.deepcopy(self.loss_config.eval) eval_loss_cls = eval_loss.pop('name') self.eval_loss_fn = eval(f'paddle.nn.loss.{eval_loss_cls}')( **eval_loss) # self.metric_config will be init in super class by get_model() assert self.metric_config is not None assert 'eval' in self.metric_config if 'train' in self.metric_config: train_metric = copy.deepcopy(self.metric_config.train) train_metric_cls = train_metric.pop('name') self.train_metric = eval(f'{train_metric_cls}')(**train_metric) eval_metric = copy.deepcopy(self.metric_config.eval) eval_metric_cls = eval_metric.pop('name') self.eval_metric = eval(f'{eval_metric_cls}')(**eval_metric) self.best_metric = 0.0 def process_configs(self, configs): return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") self.metric_config = model_setting.pop("metric", None) self.loss_config = model_setting.pop("loss", None) pretrained = model_setting.pop("pretrained") num_classes = model_setting.pop("num_classes", 2) assert pretrained is not None model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] num_heads = model_setting['num_attention_heads'] s = self.configs.Data.Train.dataset.max_length get_model_size(l, h, v, s) if self.nranks == 1: model = gpt.GPTForSequenceClassification( gpt.GPTModel(**model_setting), num_classes) else: raise NotImplementedError pretrained_path = pretrained + ".pdparams" assert os.path.exists( pretrained_path), f'{pretrained_path} is not exists!' model_dict = paddle.load(pretrained_path) # Note(GuoxiaWang): Guess whether to convert fused vs non-fused parameters. 
# 'q_proj' vs 'qkv_proj' def is_fused(model_state): for key in model_state: if 'qkv_proj' in key: return True return False def split_params(model_state, num_layers): for idx in range(num_layers): qkv_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias') qkv_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight') qkv_b = qkv_b.reshape((num_heads, 3, -1)) qkv_w = qkv_w.reshape((h, num_heads, 3, -1)) q_w, k_w, v_w = np.split(qkv_w, 3, axis=2) q_w = q_w.reshape((h, -1)) k_w = k_w.reshape((h, -1)) v_w = v_w.reshape((h, -1)) q_b, k_b, v_b = np.split(qkv_b, 3, axis=1) q_b = q_b.reshape((-1)) k_b = k_b.reshape((-1)) v_b = v_b.reshape((-1)) model_state[ f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias'] = q_b model_state[ f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight'] = q_w model_state[ f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias'] = k_b model_state[ f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight'] = k_w model_state[ f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias'] = v_b model_state[ f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight'] = v_w return model_state def fuse_params(model_state, num_layers): for idx in range(num_layers): q_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.q_proj.bias') q_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.q_proj.weight') k_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.k_proj.bias') k_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.k_proj.weight') v_b = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.v_proj.bias') v_w = model_state.pop( f'gpt.decoder.layers.{idx}.self_attn.v_proj.weight') q_w = q_w.reshape((h, num_heads, -1)) k_w = k_w.reshape((h, num_heads, -1)) v_w = v_w.reshape((h, num_heads, -1)) qkv_w = np.stack([q_w, k_w, v_w], axis=2) qkv_w = qkv_w.reshape((h, -1)) q_b = q_b.reshape((num_heads, -1)) k_b = k_b.reshape((num_heads, -1)) v_b = v_b.reshape((num_heads, -1)) qkv_b = np.stack([q_b, k_b, v_b], axis=1) qkv_b = qkv_b.reshape((-1)) model_state[ f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight'] = qkv_w model_state[ f'gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias'] = qkv_b return model_state fused = is_fused(model.state_dict()) load_fused = is_fused(model_dict) if fused is True and load_fused is False: model_dict = fuse_params(model_dict, l) elif fused is False and load_fused is True: model_dict = split_params(model_dict, l) for name, param in model.state_dict().items(): if name in model_dict and param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) model.set_state_dict(model_dict) logger.info(f'Load pretrained weight from {pretrained_path}') return model def forward(self, tokens): return self.model(tokens) def training_step(self, batch): input_ids, labels = batch input_ids.stop_gradient = True labels.stop_gradient = True logits = self(input_ids) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): speed = 1. 
/ log_dict['train_cost'] default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_length logger.info( "[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s" % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size)) def validation_step(self, batch): input_ids, labels = batch input_ids.stop_gradient = True labels.stop_gradient = True logits = self(input_ids) loss = self.eval_loss_fn(logits, labels) correct = self.eval_metric.compute(logits, labels) self.eval_metric.update(correct) return loss def validation_step_end(self, log_dict): speed = 1. / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['eval_cost'], speed)) def test_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.eval_loss_fn(preds, labels, loss_mask) return loss def test_step_end(self, log_dict): speed = 1. / log_dict['test_cost'] logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'], speed)) def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) def validation_epoch_end(self, log_dict): res = self.eval_metric.accumulate() self.eval_metric.reset() if isinstance(self.eval_metric, AccuracyAndF1): msg = "acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f" % ( res[0], res[1], res[2], res[3], res[4]) metric = res[4] elif isinstance(self.eval_metric, Mcc): msg = "mcc: %.5f" % (res[0]) metric = res[0] elif isinstance(self.eval_metric, PearsonAndSpearman): msg = "pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f" % ( res[0], res[1], res[2]) metric = res[2] else: msg = "acc: %.5f" % (res) metric = res if metric > self.best_metric: self.best_metric = metric logger.info( "[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f" % (log_dict['epoch'], log_dict['eval_cost'], msg, self.best_metric)) class GPTGenerationModule(BasicModule): def __init__(self, configs): self.configs = configs self.generation_cfgs = configs.Generation self.nranks = paddle.distributed.get_world_size() super().__init__(configs) def process_configs(self, configs): configs = process_configs(configs) return configs def get_model(self): model_setting = copy.deepcopy(self.configs.Model) if 'Compress' in self.configs and 'Quantization' in self.configs.Compress: quant_setting = copy.deepcopy(self.configs.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', 
self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: model = gpt.GPTForGeneration( gpt.GPTModel(**model_setting), self.generation_cfgs) else: assert self.nranks == self.configs.Distributed.dp_degree, \ "only support single card and data parallel in generation task." model = gpt.GPTForGenerationHybrid( gpt.GPTModelHybrid(**model_setting), self.generation_cfgs) self.generation_cfgs['max_dec_len'] = self.adjust_length_to_model( self.generation_cfgs['max_dec_len'], 512) self.generation_cfgs['bos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['eos_token_id'] = self.tokenizer.eos_token_id self.generation_cfgs['pad_token_id'] = self.tokenizer.eos_token_id return model def adjust_length_to_model(self, length, max_sequence_length): if length < 0 or length > max_sequence_length: length = max_sequence_length return length def left_padding(self, inputs, pad_id, padding="longest"): assert "input_ids" in inputs, "input_ids should be in inputs!" max_length = 0 for ids in inputs["input_ids"]: max_length = max(max_length, len(ids)) def extend_max_lenth(value, max_length, to_pad_id): return [to_pad_id] * (max_length - len(value)) + value def extend_filed(name, max_length, to_pad_id): values = inputs[name] res = [] for index, value in enumerate(values): res.append(extend_max_lenth(value, max_length, to_pad_id)) inputs[name] = res extend_filed("input_ids", max_length, pad_id) if "attention_mask" in inputs: extend_filed("attention_mask", max_length, 0) if "position_ids" in inputs: extend_filed("position_ids", max_length, 0) return inputs def generate(self, input_text): return self(input_text) def forward(self, input_text): input_ids = self.tokenizer.encode(input_text) inputs = {'input_ids': [input_ids]} inputs = self.left_padding(inputs, self.tokenizer.eos_token_id) input_ids = inputs['input_ids'] if len(input_ids) == 0: input_ids = None else: # [1, seq_len] input_ids = paddle.to_tensor(input_ids, dtype='int64') ids, scores = self.model(input_ids=input_ids) generated_sequences = [] for i, generated_ids in enumerate(ids): generated_ids = generated_ids.numpy().tolist() # Decode text text = self.tokenizer.convert_ids_to_string(generated_ids) sequence = input_text + text generated_sequences.append(sequence) return generated_sequences def input_spec(self): return [InputSpec(shape=[None, None], name="input_ids", dtype='int64')] class GPTEvalModule(LanguageModule): def __init__(self, configs): self.eval_cfgs = configs.Offline_Eval super().__init__(configs) self.post_process_configs() self.first_step = True self.total_score = 0 self.score_name = "loss" if not self.eval_cfgs.cloze_eval else "number correct" def post_process_configs(self): self.configs.pop("Optimizer", None) self.configs.pop("Inference", None) self.configs.Data.pop("Train", None) self.configs.Data.pop("Test", None) self.configs.Data.Eval.pop("sampler", None) self.configs.Data.Eval.loader.collate_fn = "gpt_collate_fn" self.configs.Data.Eval.loader.batch_size = self.eval_cfgs.batch_size self.configs.Data.Eval.dataset.input_dir = self.eval_cfgs.eval_path self.configs.Data.Eval.dataset.max_seq_len = self.eval_cfgs.max_seq_len self.configs.Engine.logging_freq = self.eval_cfgs.logging_freq if not self.eval_cfgs.cloze_eval: self.configs.Data.Eval.dataset.name = "LM_Eval_Dataset" self.configs.Data.Eval.dataset.overlapping_eval = self.eval_cfgs.overlapping_eval else: self.configs.Data.Eval.dataset.name = "Lambada_Eval_Dataset" def get_model(self): 
model_setting = copy.deepcopy(self.configs.Model) if 'Compress' in self.configs and 'Quantization' in self.configs.Compress: quant_setting = copy.deepcopy(self.configs.Compress.Quantization) skip_tensor_map = quant_setting.get('skip_tensor_map', {}) freeze_embedding = quant_setting.get('freeze_embedding', False) model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: raise RuntimeError( "Only single-card offline eval is supported in GPTModel now.") return model def forward(self, tokens, ids, mask): return self.model(tokens, ids, mask) def validation_step(self, batch): tokens, loss_mask, attention_mask, position_ids, labels, info = batch preds = self(tokens, position_ids, attention_mask) if not self.eval_cfgs.cloze_eval: if self.first_step: self.num_original_tokens = info.numpy()[0][0] self.num_tokenized_tokens = info.numpy()[0][1] masked_lm_loss = paddle.nn.functional.cross_entropy( preds, labels, reduction="none") loss = paddle.sum(masked_lm_loss * loss_mask) return loss else: if self.first_step: self.num_examples = info.numpy()[0][0] outputs = paddle.argmax(preds, -1) acc = paddle.cast(outputs == labels, 'float32') acc = paddle.where( paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc)) acc = paddle.sum(paddle.prod(acc, -1)) return acc self.first_step = False def validation_step_end(self, log_dict): speed = 1. 
/ log_dict['eval_cost'] if not self.eval_cfgs.cloze_eval: self.total_score += log_dict[ 'loss'] * self.configs.Engine.logging_freq / ( self.num_tokenized_tokens - 1) else: self.total_score += log_dict[ 'loss'] * self.configs.Engine.logging_freq logger.info("[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], self.score_name, self.total_score, speed)) def validation_epoch_end(self, log_dict): if not self.eval_cfgs.cloze_eval: total_loss = float(self.total_score) ppl = math.exp(min(20, total_loss)) token_ratio = (self.num_tokenized_tokens - 1) / ( self.num_original_tokens - 1) adjusted_ppl = math.exp(min(20, total_loss * token_ratio)) string = ' validation results on {} | '.format( self.eval_cfgs.eval_path) string += 'avg loss: {:.4E} | '.format(total_loss) string += 'ppl: {:.4E} | '.format(ppl) string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl) string += 'token ratio: {} |'.format(token_ratio) else: num_correct = float(self.total_score) acc = float(num_correct / self.num_examples) string = ' validation results on {} | '.format( self.eval_cfgs.eval_path) string += 'number correct: {:.4E} | '.format(num_correct) string += 'total examples: {:.4E} | '.format(self.num_examples) string += 'avg accuracy: {:.4E}'.format(acc) logger.info(string) def input_spec(self): return [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] class MoEModule(LanguageModule): def __init__(self, configs): super(MoEModule, self).__init__(configs) assert self.nranks == configs.Distributed.dp_degree, \ "only support single card or data parallel in MoE model." def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") model_setting.pop("name") l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len get_model_size(l, h, v, s) if self.nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) else: model_setting[ 'num_partitions'] = self.configs.Distributed.mp_degree if self.configs.Distributed.pp_degree == 1: model_setting.pop("virtual_pp_degree", None) model = gpt.GPTForPretrainingHybrid( gpt.GPTModelHybrid(**model_setting)) else: model = gpt.GPTForPretrainingPipe(**model_setting) return model def get_loss_fn(self): if self.nranks == 1: loss_fn = gpt.GPTPretrainingCriterion() else: loss_fn = gpt.GPTPretrainingCriterionHybird() return loss_fn def training_step(self, batch): tokens, position_ids, labels, loss_mask = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True preds = self(tokens, position_ids) loss = self.loss_fn(preds, labels, loss_mask) with paddle.amp.auto_cast(enable=False): if self.configs.Model.gate != "naive" and \ self.configs.Model.balance_loss_weight: gpt_layer = self.model._layers.gpt if isinstance( self.model, paddle.DataParallel) else self.model.gpt aux_loss_list = [ l.moe_mlp.gate.get_loss(clear=False) for l in gpt_layer.decoder.layers if hasattr(l.moe_mlp, "gate") ] bal_loss = paddle.concat(aux_loss_list) if bal_loss.dtype == paddle.float16: bal_loss = paddle.cast(bal_loss, dtype=paddle.float32) bal_loss = bal_loss.mean() loss += bal_loss * self.configs.Engine.balance_loss_weight return loss def initialize_mp_dp_parameters(self): hcg = env.get_hcg() mp_group = hcg.get_model_parallel_group() mp_src_rank = hcg.get_model_parallel_group_src_rank() dp_group 
= hcg.get_data_parallel_group() dp_src_rank = hcg.get_data_parallel_group_src_rank() for param in self.model.parameters(): if "expert_" in param.name: setattr(param, "no_sync", True) continue if not param.is_distributed: paddle.distributed.broadcast( param.detach(), src=mp_src_rank, group=mp_group, use_calc_stream=True) paddle.distributed.broadcast( param.detach(), src=dp_src_rank, group=dp_group, use_calc_stream=True) ================================================ FILE: ppfleetx/models/language_model/metrics.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import sys import math import warnings from functools import partial import numpy as np import paddle from paddle.metric import Metric, Accuracy, Precision, Recall __all__ = [ 'Accuracy', 'AccuracyAndF1', 'Mcc', 'PearsonAndSpearman', 'MultiLabelsMetric' ] class AccuracyAndF1(Metric): """ This class encapsulates Accuracy, Precision, Recall and F1 metric logic, and `accumulate` function returns accuracy, precision, recall and f1. The overview of all metrics could be seen at the document of `paddle.metric `_ for details. Args: topk (int or tuple(int), optional): Number of top elements to look at for computing accuracy. Defaults to (1,). pos_label (int, optional): The positive label for calculating precision and recall. Defaults to 1. name (str, optional): String name of the metric instance. Defaults to 'acc_and_f1'. Example: .. code-block:: import paddle from paddlenlp.metrics import AccuracyAndF1 x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]]) y = paddle.to_tensor([[1], [0], [1], [1]]) m = AccuracyAndF1() correct = m.compute(x, y) m.update(correct) res = m.accumulate() print(res) # (0.5, 0.5, 0.3333333333333333, 0.4, 0.45) """ def __init__(self, topk=(1, ), pos_label=1, name='acc_and_f1', *args, **kwargs): super(AccuracyAndF1, self).__init__(*args, **kwargs) self.topk = topk self.pos_label = pos_label self._name = name self.acc = Accuracy(self.topk, *args, **kwargs) self.precision = Precision(*args, **kwargs) self.recall = Recall(*args, **kwargs) self.reset() def compute(self, pred, label, *args): """ Accepts network's output and the labels, and calculates the top-k (maximum value in topk) indices for accuracy. Args: pred (Tensor): Predicted tensor, and its dtype is float32 or float64, and has a shape of [batch_size, num_classes]. label (Tensor): The ground truth tensor, and its dtype is is int64, and has a shape of [batch_size, 1] or [batch_size, num_classes] in one hot representation. Returns: Tensor: Correct mask, each element indicates whether the prediction equals to the label. Its' a tensor with a data type of float32 and has a shape of [batch_size, topk]. 
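
        Note:
            Besides returning the correct mask, this method caches ``label``
            and the positive-class probabilities
            ``softmax(pred)[:, pos_label]``; ``update`` then feeds them to the
            internal ``Precision`` and ``Recall`` metrics.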
""" self.label = label self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label] return self.acc.compute(pred, label) def update(self, correct, *args): """ Updates the metrics states (accuracy, precision and recall), in order to calculate accumulated accuracy, precision and recall of all instances. Args: correct (Tensor): Correct mask for calculating accuracy, and it's a tensor with shape [batch_size, topk] and has a dtype of float32. """ self.acc.update(correct) self.precision.update(self.preds_pos, self.label) self.recall.update(self.preds_pos, self.label) def accumulate(self): """ Calculates and returns the accumulated metric. Returns: tuple: The accumulated metric. A tuple of shape (acc, precision, recall, f1, average_of_acc_and_f1) With the fields: - `acc` (numpy.float64): The accumulated accuracy. - `precision` (numpy.float64): The accumulated precision. - `recall` (numpy.float64): The accumulated recall. - `f1` (numpy.float64): The accumulated f1. - `average_of_acc_and_f1` (numpy.float64): The average of accumulated accuracy and f1. """ acc = self.acc.accumulate() precision = self.precision.accumulate() recall = self.recall.accumulate() if precision == 0.0 or recall == 0.0: f1 = 0.0 else: # 1/f1 = 1/2 * (1/precision + 1/recall) f1 = (2 * precision * recall) / (precision + recall) return ( acc, precision, recall, f1, (acc + f1) / 2, ) def reset(self): """ Resets all metric states. """ self.acc.reset() self.precision.reset() self.recall.reset() self.label = None self.preds_pos = None def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name class Mcc(Metric): """ This class calculates `Matthews correlation coefficient `_ . Args: name (str, optional): String name of the metric instance. Defaults to 'mcc'. Example: .. code-block:: import paddle from paddlenlp.metrics import Mcc x = paddle.to_tensor([[-0.1, 0.12], [-0.23, 0.23], [-0.32, 0.21], [-0.13, 0.23]]) y = paddle.to_tensor([[1], [0], [1], [1]]) m = Mcc() (preds, label) = m.compute(x, y) m.update((preds, label)) res = m.accumulate() print(res) # (0.0,) """ def __init__(self, name='mcc', *args, **kwargs): super(Mcc, self).__init__(*args, **kwargs) self._name = name self.tp = 0 # true positive self.fp = 0 # false positive self.tn = 0 # true negative self.fn = 0 # false negative def compute(self, pred, label, *args): """ Processes the pred tensor, and returns the indices of the maximum of each sample. Args: pred (Tensor): The predicted value is a Tensor with dtype float32 or float64. Shape is [batch_size, 1]. label (Tensor): The ground truth value is Tensor with dtype int64, and its shape is [batch_size, 1]. Returns: tuple: A tuple of preds and label. Each shape is [batch_size, 1], with dtype float32 or float64. """ preds = paddle.argsort(pred, descending=True)[:, :1] return (preds, label) def update(self, preds_and_labels): """ Calculates states, i.e. the number of true positive, false positive, true negative and false negative samples. Args: preds_and_labels (tuple[Tensor]): Tuple of predicted value and the ground truth label, with dtype float32 or float64. Each shape is [batch_size, 1]. 
""" preds = preds_and_labels[0] labels = preds_and_labels[1] if isinstance(preds, paddle.Tensor): preds = preds.numpy() if isinstance(labels, paddle.Tensor): labels = labels.numpy().reshape(-1, 1) sample_num = labels.shape[0] for i in range(sample_num): pred = preds[i] label = labels[i] if pred == 1: if pred == label: self.tp += 1 else: self.fp += 1 else: if pred == label: self.tn += 1 else: self.fn += 1 def accumulate(self): """ Calculates and returns the accumulated metric. Returns: tuple: Returns the accumulated metric, a tuple of shape (mcc,), `mcc` is the accumulated mcc and its data type is float64. """ if self.tp == 0 or self.fp == 0 or self.tn == 0 or self.fn == 0: mcc = 0.0 else: # mcc = (tp*tn-fp*fn)/ sqrt(tp+fp)(tp+fn)(tn+fp)(tn+fn)) mcc = (self.tp * self.tn - self.fp * self.fn) / math.sqrt( (self.tp + self.fp) * (self.tp + self.fn) * (self.tn + self.fp) * (self.tn + self.fn)) return (mcc, ) def reset(self): """ Resets all metric states. """ self.tp = 0 # true positive self.fp = 0 # false positive self.tn = 0 # true negative self.fn = 0 # false negative def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name class PearsonAndSpearman(Metric): """ The class calculates `Pearson correlation coefficient `_ and `Spearman's rank correlation coefficient `_ . Args: name (str, optional): String name of the metric instance. Defaults to 'pearson_and_spearman'. Example: .. code-block:: import paddle from paddlenlp.metrics import PearsonAndSpearman x = paddle.to_tensor([[0.1], [1.0], [2.4], [0.9]]) y = paddle.to_tensor([[0.0], [1.0], [2.9], [1.0]]) m = PearsonAndSpearman() m.update((x, y)) res = m.accumulate() print(res) # (0.9985229081857804, 1.0, 0.9992614540928901) """ def __init__(self, name='pearson_and_spearman', *args, **kwargs): super(PearsonAndSpearman, self).__init__(*args, **kwargs) self._name = name self.preds = [] self.labels = [] def update(self, preds_and_labels): """ Ensures the type of preds and labels is numpy.ndarray and reshapes them into [-1, 1]. Args: preds_and_labels (tuple[Tensor] or list[Tensor]): Tuple or list of predicted value and the ground truth label. Its data type should be float32 or float64 and its shape is [batch_size, d0, ..., dN]. """ preds = preds_and_labels[0] labels = preds_and_labels[1] if isinstance(preds, paddle.Tensor): preds = preds.numpy() if isinstance(labels, paddle.Tensor): labels = labels.numpy() preds = np.squeeze(preds.reshape(-1, 1)).tolist() labels = np.squeeze(labels.reshape(-1, 1)).tolist() self.preds.append(preds) self.labels.append(labels) def accumulate(self): """ Calculates and returns the accumulated metric. Returns: tuple: Returns the accumulated metric, a tuple of (pearson, spearman, the_average_of_pearson_and_spearman). With the fields: - `pearson` (numpy.float64): The accumulated pearson. - `spearman` (numpy.float64): The accumulated spearman. - `the_average_of_pearson_and_spearman` (numpy.float64): The average of accumulated pearson and spearman correlation coefficient. 
""" preds = [item for sublist in self.preds for item in sublist] labels = [item for sublist in self.labels for item in sublist] pearson = self.pearson(preds, labels) spearman = self.spearman(preds, labels) return ( pearson, spearman, (pearson + spearman) / 2, ) def pearson(self, preds, labels): n = len(preds) # simple sums sum1 = sum(float(preds[i]) for i in range(n)) sum2 = sum(float(labels[i]) for i in range(n)) # sum up the squares sum1_pow = sum([pow(v, 2.0) for v in preds]) sum2_pow = sum([pow(v, 2.0) for v in labels]) # sum up the products p_sum = sum([preds[i] * labels[i] for i in range(n)]) numerator = p_sum - (sum1 * sum2 / n) denominator = math.sqrt( (sum1_pow - pow(sum1, 2) / n) * (sum2_pow - pow(sum2, 2) / n)) if denominator == 0: return 0.0 return numerator / denominator def spearman(self, preds, labels): preds_rank = self.get_rank(preds) labels_rank = self.get_rank(labels) total = 0 n = len(preds) for i in range(n): total += pow((preds_rank[i] - labels_rank[i]), 2) spearman = 1 - float(6 * total) / (n * (pow(n, 2) - 1)) return spearman def get_rank(self, raw_list): x = np.array(raw_list) r_x = np.empty(x.shape, dtype=int) y = np.argsort(-x) for i, k in enumerate(y): r_x[k] = i + 1 return r_x def reset(self): """ Resets all metric states. """ self.preds = [] self.labels = [] def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name class MultiLabelsMetric(Metric): """ This class encapsulates Accuracy, Precision, Recall and F1 metric logic in multi-labels setting (also the binary setting). Some codes are taken and modified from sklearn.metrics . Args: num_labels (int) The total number of labels which is usually the number of classes name (str, optional): String name of the metric instance. Defaults to 'multi_labels_metric'. Example: .. code-block:: import paddle from paddlenlp.metrics import MultiLabelsMetric x = paddle.to_tensor([[0.1, 0.2, 0.9], [0.5, 0.8, 0.5], [0.6, 1.5, 0.4], [2.8, 0.7, 0.3]]) y = paddle.to_tensor([[2], [1], [2], [1]]) m = MultiLabelsMetric(num_labels=3) args = m.compute(x, y) m.update(args) result1 = m.accumulate(average=None) # (array([0.0, 0.5, 1.0]), array([0.0, 0.5, 0.5]), array([0.0, 0.5, 0.66666667])) result2 = m.accumulate(average='binary', pos_label=0) # (0.0, 0.0, 0.0) result3 = m.accumulate(average='binary', pos_label=1) # (0.5, 0.5, 0.5) result4 = m.accumulate(average='binary', pos_label=2) # (1.0, 0.5, 0.6666666666666666) result5 = m.accumulate(average='micro') # (0.5, 0.5, 0.5) result6 = m.accumulate(average='macro') # (0.5, 0.3333333333333333, 0.38888888888888884) result7 = m.accumulate(average='weighted') # (0.75, 0.5, 0.5833333333333333) Note: When zero_division is encountered (details as followed), the corresponding metrics will be set to 0.0 precision is zero_division if there are no positive predictions recall is zero_division if there are no positive labels fscore is zero_division if all labels AND predictions are negative """ def __init__(self, num_labels, name='multi_labels_metric'): super(MultiLabelsMetric, self).__init__() if num_labels <= 1: raise ValueError( f"The num_labels is {num_labels}, which must be greater than 1." ) self.num_labels = num_labels self._name = name self._confusion_matrix = np.zeros((num_labels, 2, 2), dtype=int) def update(self, args): """ Updates the metrics states (accuracy, precision and recall), in order to calculate accumulated accuracy, precision and recall of all instances. 
Args: args (tuple of Tensor): the tuple returned from `compute` function """ pred = args[0].numpy() label = args[1].numpy() tmp_confusion_matrix = self._multi_labels_confusion_matrix(pred, label) self._confusion_matrix += tmp_confusion_matrix def accumulate(self, average=None, pos_label=1): """ Calculates and returns the accumulated metric. Args: average (str in {‘binary’, ‘micro’, ‘macro’, ’weighted’} or None, optional): Defaults to `None`. If `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: - `binary` : Only report results for the class specified by pos_label. - `micro` : Calculate metrics globally by counting the total true positives, false negatives and false positives. - `macro` : Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. - `weighted` : Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `macro` to account for label imbalance; it can result in an F-score that is not between precision and recall. pos_label (int, optional): The positive label for calculating precision and recall in binary settings. Noted: Only when `average='binary'`, this arguments will be used. Otherwise, it will be ignored. Defaults to 1. Returns: tuple: The accumulated metric. A tuple of shape (precision, recall, f1) With the fields: - `precision` (numpy.float64 or numpy.ndarray if average=None): The accumulated precision. - `recall` (numpy.float64 or numpy.ndarray if average=None): The accumulated recall. - `f1` (numpy.float64 or numpy.ndarray if average=None): The accumulated f1. """ if average not in {'binary', 'micro', 'macro', 'weighted', None}: raise ValueError(f"The average is {average}, which is unknown.") if average == 'binary': if pos_label >= self.num_labels: raise ValueError( f"The pos_label is {pos_label}, num_labels is {self.num_labels}. 
" f"The num_labels must be greater than pos_label.") confusion_matrix = None # [*, 2, 2] if average == 'binary': confusion_matrix = np.expand_dims( self._confusion_matrix[pos_label], axis=0) elif average == 'micro': confusion_matrix = self._confusion_matrix.sum(axis=0, keepdims=True) # if average is 'macro' or 'weighted' or None else: confusion_matrix = self._confusion_matrix tp = confusion_matrix[:, 1, 1] # [*,] pred = tp + confusion_matrix[:, 0, 1] # [*,] true = tp + confusion_matrix[:, 1, 0] # [*,] def _robust_divide(numerator, denominator, metric_name): mask = denominator == 0.0 denominator = denominator.copy() denominator[mask] = 1 # avoid zero division result = numerator / denominator if not np.any(mask): return result # precision is zero_division if there are no positive predictions # recall is zero_division if there are no positive labels # fscore is zero_division if all labels AND predictions are negative warnings.warn(f'Zero division when calculating {metric_name}.', UserWarning) result[mask] = 0.0 return result precision = _robust_divide(tp, pred, 'precision') recall = _robust_divide(tp, true, 'recall') f1 = _robust_divide(2 * (precision * recall), (precision + recall), 'f1') weights = None # [num_labels] if average == 'weighted': weights = true if weights.sum() == 0: zero_division_value = np.float64(0.0) if pred.sum() == 0: return (zero_division_value, zero_division_value, zero_division_value) else: return (np.float64(0.0), zero_division_value, np.float64(0.0)) elif average == 'macro': weights = np.ones((self.num_labels), dtype=float) if average is not None: precision = np.average(precision, weights=weights) recall = np.average(recall, weights=weights) f1 = np.average(f1, weights=weights) return precision, recall, f1 def compute(self, pred, label): """ Accepts network's output and the labels, and calculates the top-k (maximum value in topk) indices for accuracy. Args: pred (Tensor): Predicted tensor, and its dtype is float32 or float64, and has a shape of [batch_size, *, num_labels]. label (Tensor): The ground truth tensor, and its dtype is is int64, and has a shape of [batch_size, *] or [batch_size, *, num_labels] in one hot representation. Returns: tuple of Tensor: it contains two Tensor of shape [*, 1]. The tuple should be passed to `update` function. 
""" if not (paddle.is_tensor(pred) and paddle.is_tensor(label)): raise ValueError('pred and label must be paddle tensor') if pred.shape[-1] != self.num_labels: raise ValueError(f'The last dim of pred is {pred.shape[-1]}, ' f'which should be num_labels') pred = paddle.reshape(pred, [-1, self.num_labels]) pred = paddle.argmax(pred, axis=-1) if label.shape[-1] == self.num_labels: label = paddle.reshape(label, [-1, self.num_labels]) label = paddle.argmax(label, axis=-1) else: label = paddle.reshape(label, [-1]) if paddle.max(label) >= self.num_labels: raise ValueError( f"Tensor label has value {paddle.max(label)}, " f"which is no less than num_labels") if pred.shape[0] != label.shape[0]: raise ValueError( f"The length of pred is not equal to the length of label") return pred, label def _multi_labels_confusion_matrix(self, pred, label): tp_bins = label[pred == label] tp = np.bincount(tp_bins, minlength=self.num_labels) # [num_labels,] tp_plus_fp = np.bincount( pred, minlength=self.num_labels) # [num_labels,] tp_plus_fn = np.bincount( label, minlength=self.num_labels) # [num_labels,] fp = tp_plus_fp - tp # [num_labels,] fn = tp_plus_fn - tp # [num_labels,] tn = pred.shape[0] - tp - fp - fn # [num_labels,] return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) # [num_labels, 2, 2] def reset(self): self._confusion_matrix = np.zeros((self.num_labels, 2, 2), dtype=int) def name(self): """ Returns name of the metric instance. Returns: str: The name of the metric instance. """ return self._name ================================================ FILE: ppfleetx/models/language_model/moe/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .gate import GShardGate, BaseGate, SwitchGate, NaiveGate from .moe_layer import MoELayer ================================================ FILE: ppfleetx/models/language_model/moe/comm/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/moe/comm_ops.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). import paddle from paddle.autograd import PyLayer from paddle.distributed.utils.moe_utils import global_scatter, global_gather from .utils import _local_scatter, _local_gather, _all_gather class MoEScatter(PyLayer): r""" Scatter input samples from [batch x sequences] to contiguous alone experts. If `world_size` is greater than 1, the samples will first be locally scattered, and then exchanged across workers. """ @staticmethod def forward(ctx, inp, pos, local_expert_count, global_expert_count, fwd_batch_size, world_size, group=None): local_input_buf = _local_scatter(inp, pos) if world_size > 1: global_input_buf = global_scatter( local_input_buf, local_expert_count, global_expert_count, group=group) else: global_input_buf = local_input_buf ctx.moe_args = inp.shape[0], world_size, group variables = (pos, local_expert_count, global_expert_count) ctx.save_for_backward(*variables) return global_input_buf @staticmethod def backward(ctx, grad): (pos, local_expert_count, global_expert_count) = ctx.saved_tensor() (inp_batch_size, world_size, group) = ctx.moe_args if world_size > 1: local_grad_in = global_gather( grad, local_expert_count, global_expert_count, group=group) else: local_grad_in = grad grad_in = _local_gather(local_grad_in, pos, inp_batch_size) return grad_in, None, None, None class MoEGather(PyLayer): r""" Gather output samples from contiguous alone experts back to [batch x sequences]. Works symmetrically with MoEScatter. """ @staticmethod def forward(ctx, global_output_buf, pos, local_expert_count, global_expert_count, local_batch_size, world_size, group=None): if world_size > 1: local_output_buf = global_gather( global_output_buf, local_expert_count, global_expert_count, group=group) else: local_output_buf = global_output_buf output = _local_gather( local_output_buf, pos, local_batch_size, maybe_overlap=False) ctx.moe_args = (global_output_buf.shape[0], world_size, group) variables = (pos, local_expert_count, global_expert_count) ctx.save_for_backward(*variables) return output @staticmethod def backward(ctx, grad_out): pos, local_expert_count, global_expert_count = ctx.saved_tensor() fwd_batch_size, world_size, group = ctx.moe_args grad_out_buf = _local_scatter(grad_out, pos) if world_size > 1: global_grad_out_buf = global_scatter( grad_out_buf, local_expert_count, global_expert_count, group=group) else: global_grad_out_buf = grad_out_buf return global_grad_out_buf, None, None, None class AllGather(PyLayer): r""" A wrapper for the All-Gather function to support auto-differentiation. 
""" @staticmethod def forward(ctx, inp, rank, world_size, group): tensor_list = [] paddle.distributed.all_gather(tensor_list, inp, group=group) output = paddle.concat(tensor_list, axis=0) ctx.args = rank, inp.shape[0] return output @staticmethod def backward(ctx, grad_out): rank, dim0 = ctx.args return paddle.slice( grad_out, axes=[0], starts=[rank * dim0], ends=[(rank + 1) * dim0]) class Slice(PyLayer): r""" A wrapper for the Slice function to support auto-differentiation. """ @staticmethod def forward(ctx, inp, rank, world_size, group): B = inp.shape[0] local_batch_size = B // world_size batch_start = local_batch_size * rank batch_end = min(batch_start + local_batch_size, B) inp = paddle.slice( inp, axes=[0], starts=[batch_start], ends=[batch_end]) ctx.args = world_size, group return inp @staticmethod def backward(ctx, grad_out): world_size, group = ctx.args return _all_gather(grad_out, group=group) ================================================ FILE: ppfleetx/models/language_model/moe/gate/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .gshard_gate import GShardGate from .switch_gate import SwitchGate from .naive_gate import NaiveGate from .base_gate import BaseGate ================================================ FILE: ppfleetx/models/language_model/moe/gate/base_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/base_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). 
import paddle.nn as nn class BaseGate(nn.Layer): def __init__(self, num_expert, group=None): super().__init__() self.world_size = group.nranks if group is not None else 1 self.num_expert = num_expert self.tot_expert = self.world_size * num_expert self.loss = None def forward(self, x): raise NotImplementedError("Please implement the forward function.") def set_loss(self, loss): self.loss = loss def get_loss(self, clear=True): loss = self.loss if clear: self.loss = None return loss ================================================ FILE: ppfleetx/models/language_model/moe/gate/gshard_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). import math import paddle import paddle.nn.functional as F from .naive_gate import NaiveGate from ..utils import limit_by_capacity class GShardGate(NaiveGate): def __init__(self, d_model, num_expert, topk=2, capacity=(1.2, 2.4), random_routing=True, group=None): assert topk == 2, "topk should be 2 in gshard" super().__init__(d_model, num_expert, group) self.capacity = capacity self.random_routing = random_routing self.group = group def forward(self, x): topk_val, topk_idx, gate_score = super().forward( x, return_all_scores=True) s = gate_score.shape[0] top1_idx = topk_idx.flatten() c_e = paddle.scatter( paddle.zeros(shape=[self.tot_expert]), top1_idx, paddle.ones_like( top1_idx, dtype="float32"), overwrite=False) / s m_e = paddle.mean(F.softmax(gate_score, axis=1), axis=0) loss = paddle.mean(c_e * m_e) * (self.num_expert**2) self.set_loss(loss) cap_rate = self.capacity[0 if self.training else 1] capacity = math.ceil(cap_rate * x.shape[0]) _new_lec, _new_gec, topk_idx = limit_by_capacity( topk_idx, self.num_expert, self.world_size, capacity, group=self.group) if self.random_routing: rand_routing_prob = paddle.rand( shape=[gate_score.shape[0]], dtype="float32") topk_idx = paddle.distributed.models.moe.utils._random_routing( topk_idx, topk_val, rand_routing_prob) return topk_val, topk_idx ================================================ FILE: ppfleetx/models/language_model/moe/gate/naive_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/naive_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). from .base_gate import BaseGate import paddle import paddle.nn as nn class NaiveGate(BaseGate): def __init__(self, d_model, num_expert, group=None, topk=2): super().__init__(num_expert, group) self.gate = nn.Linear(d_model, self.tot_expert) self.gate.weight.name = "gate_" + self.gate.weight.name self.gate.bias.name = "gate_" + self.gate.bias.name self.top_k = topk def forward(self, inp, return_all_scores=False): gate = self.gate(inp) gate_top_k_val, gate_top_k_idx = paddle.topk( gate, k=self.top_k, axis=-1, largest=True, sorted=False) if return_all_scores: return gate_top_k_val, gate_top_k_idx, gate return gate_top_k_val, gate_top_k_idx ================================================ FILE: ppfleetx/models/language_model/moe/gate/switch_gate.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/gates/switch_gate.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). 
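# Illustrative sketch, not part of the original sources: a minimal,
# single-process use of the NaiveGate defined in naive_gate.py above. The
# sizes are demo assumptions; with group=None the gate sees world_size == 1,
# so tot_expert == num_expert. The helper is defined but never called here.
def _demo_naive_gate():
    import paddle
    from ppfleetx.models.language_model.moe.gate import NaiveGate

    gate = NaiveGate(d_model=16, num_expert=4, group=None, topk=2)
    tokens = paddle.randn([8, 16])       # [num_tokens, d_model]
    topk_val, topk_idx = gate(tokens)    # each has shape [8, 2]
    return topk_val, topk_idx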
import math import paddle import paddle.nn.functional as F from .naive_gate import NaiveGate from ..utils import limit_by_capacity class SwitchGate(NaiveGate): def __init__(self, d_model, num_expert, topk=1, switch_eps=.1, capacity=(1.2, 2.4), group=None): assert topk == 1, "topk should be 1 in switch" super().__init__(d_model, num_expert, group, topk=1) self.switch_eps = switch_eps self.capacity = capacity self.group = group def forward(self, inp): score = self.gate(inp) if self.training: noise = paddle.rand(shape=score.shape) noise = noise * 2 * self.switch_eps + 1.0 - self.switch_eps score += noise score = F.softmax(score, axis=-1) top1_score, top1_idx = paddle.topk(score, k=1, axis=-1, largest=True) cap_rate = self.capacity[0 if self.training else 1] capacity = math.ceil(cap_rate * inp.shape[0]) _new_lec, _new_gec, top1_idx = limit_by_capacity( top1_idx, self.num_expert, self.world_size, capacity, group=self.group) valid_idx = top1_idx[top1_idx > -1] valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1]) fraction_expert = paddle.scatter_nd_add( x=paddle.zeros(shape=[self.tot_expert]), index=valid_idx_tmp, updates=paddle.ones_like( valid_idx, dtype=paddle.float32).reshape( shape=[len(valid_idx)]), ) / valid_idx.numel() prob_expert = score.sum(axis=0) / valid_idx.numel() loss = (fraction_expert * prob_expert).sum() * self.tot_expert self.set_loss(loss) return top1_score, top1_idx ================================================ FILE: ppfleetx/models/language_model/moe/moe_layer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/layers.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). import numpy as np import paddle import paddle.nn as nn from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate from .comm_ops import MoEScatter, MoEGather, AllGather, Slice from .utils import prepare_forward from paddle.distributed.fleet.utils import recompute from paddle.incubate.distributed.fleet import recompute_hybrid class MoELayer(nn.Layer): """MoE Layer Args: d_model: (int) model dimention experts: (list|nn.LayerList) expert networks list gate: (str|BaseGate|None): if gate is a str, it can only be "naive", "gshard", "switch" or None, default is "naive" else gate is an instance of BaseGate top_k: (int) default value is 2 moe_group: moe group for experts communication mp_group: mp group for mp commutication recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute. recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given. Examples: .. 
code-block:: python from paddle.nn import layer, LayerList from paddle.distributed.moe import MoElayer from paddle.distributed.collective import Group from paddle.distributed import fleet moe_group = Group(fleet.worker_index(), 0, list(range(fleet.worker_num()))) mp_group = None num_experts=8 dim_feedforward=512 d_model=8 top_k=2 class ExpertLayer(Layer): def __init__(self, d_model, d_hidden, name=None): super(ExpertLayer, self).__init__() self.htoh4 = nn.Linear(d_model, d_hidden) self.h4toh = nn.Linear(d_hidden, d_model) def forward(self, x): x = self.htoh4(x) x = self.h4toh(x) return x experts_list = LayerList() for expi in range(num_experts): exp_layer = ExpertLayer(d_model, dim_feedforward) experts_list.append(exp_layer) moeLayer = MoELayer(d_model = d_model, experts=experts_list, gate="gshard", top_k=2, moe_group=moe_group, mp_group=mp_group, recompute_interval=0) """ def __init__(self, d_model, experts, moe_group=None, mp_group=None, top_k=2, gate=None, recompute_interval=0, recompute_partition=False, recompute_offload=False): super(MoELayer, self).__init__() self.d_model = d_model assert experts is not None assert isinstance(experts, (list, nn.LayerList)), \ "The type of experts must be list or nn.LayerList" for i, exp in enumerate(experts): assert isinstance( exp, nn.Layer), "The type of experts[{}] must be nn.Layer".format(i) self.experts = nn.LayerList(experts) if isinstance(experts, list) else experts self.num_experts = len(experts) gate = "naive" if gate is None else gate assert isinstance(gate, (str, BaseGate)), \ "The type of gate must be str or an instance of BaseGate" self.top_k = top_k # only support mp/dp self.group = moe_group self.mp_group = mp_group self.world_size = self.group.nranks \ if self.group is not None else 1 if isinstance(gate, str): gate_map = { "naive": NaiveGate, "gshard": GShardGate, "switch": SwitchGate, } if gate in gate_map.keys(): self.gate = gate_map[gate](self.d_model, num_expert=self.num_expert, topk=self.top_k, group=self.group) else: assert False, "We only support naive gate, \ gshard gate and switch gate, \ but you choose {} gate.".format(gate) elif isinstance(gate, BaseGate): self.gate = gate else: raise TypeError("The type of gate must be either str in ('naive', \ 'gshard', 'switch') or an instance of moe.BaseGate") self.recompute_interval = recompute_interval self.recompute_ctx = { "mp_group": self.mp_group, "offload": recompute_offload, "partition": recompute_partition, } def forward(self, inp): origin_shape = inp.shape inp = inp.reshape_([-1, origin_shape[-1]]) mp_rank = 0 mp_size = 1 if self.mp_group is not None: mp_rank = self.mp_group.rank mp_size = self.mp_group.nranks if mp_size > 1: inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group) value, gate = self.gate(inp) ( pos, local_expert_count, global_expert_count, fwd_expert_count, fwd_batch_size, ) = prepare_forward(gate, self.num_expert, self.world_size, self.group) topk = 1 if len(gate.shape) == 2: topk = gate.shape[1] if pos.shape != [0]: temp_pos = pos // topk else: temp_pos = pos assert topk == self.top_k x = MoEScatter.apply(inp, temp_pos, local_expert_count, global_expert_count, fwd_batch_size, self.world_size, self.group) d_model = self.d_model def experts_fwd(x, fwd_expert_count, experts): if x.shape[0] == 0: return x y = [] last_index = 0 assert isinstance(fwd_expert_count, np.ndarray) assert len(experts) == len(fwd_expert_count) for idx, expert_count in enumerate(fwd_expert_count): if expert_count <= 0: continue y.append(experts[idx](x[last_index:expert_count + 
last_index])) last_index = expert_count + last_index return paddle.concat(y, axis=0) if self.recompute_interval <= 0 or x.shape[0] == 0: x = experts_fwd(x, fwd_expert_count.numpy(), self.experts) elif self.world_size > 1: x = recompute_hybrid(self.recompute_ctx, experts_fwd, x, fwd_expert_count.numpy(), self.experts) else: x = recompute(experts_fwd, x, fwd_expert_count.numpy(), self.experts) out_batch_size = inp.shape[0] if len(gate.shape) == 2: out_batch_size *= gate.shape[1] x = MoEGather.apply(x, pos, local_expert_count, global_expert_count, out_batch_size, self.world_size, self.group) x = x.reshape([-1, self.top_k, d_model]) value = value.reshape([x.shape[0], 1, self.top_k]) x = paddle.bmm(value, x).reshape([-1, d_model]) if mp_size > 1: x = AllGather.apply(x, mp_rank, mp_size, self.mp_group) x = paddle.reshape_(x, origin_shape) return x ================================================ FILE: ppfleetx/models/language_model/moe/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # The file has been adapted from the file: # https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py # Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 # We retain the following license from the original files: # Copyright 2021, Jiaao He. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"). 
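# Illustrative sketch, not part of the original sources: a single-process
# round trip through _local_scatter / _local_gather defined below in this
# file. Scattering reorders rows by `pos`; gathering with the same positions
# restores the original order. Values are demo assumptions and the helper is
# never called at import time.
def _demo_local_scatter_gather():
    import paddle

    inp = paddle.arange(6, dtype="float32").reshape([3, 2])   # 3 tokens, d_model=2
    pos = paddle.to_tensor([2, 0, 1], dtype="int64")          # a permutation
    buf = _local_scatter(inp, pos)                            # buf[i] == inp[pos[i]]
    out = _local_gather(buf, pos, out_batch_size=3)           # out[pos[i]] == buf[i]
    return out                                                # equals inp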
import paddle from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos def prepare_forward(gate, num_expert, world_size, moe_group): pos, local_expert_count, global_expert_count = count_by_gate( gate, num_expert, world_size, group=moe_group) with paddle.no_grad(): fwd_expert_count = global_expert_count.reshape_( [world_size, num_expert]).sum(axis=0) fwd_batch_size = int(fwd_expert_count.sum().item()) return ( pos, local_expert_count, global_expert_count, fwd_expert_count, fwd_batch_size, ) def _alltoall(in_tensor_list, group=None, use_calc_stream=True): if group is not None and not group.is_member(): return group = paddle.distributed.collective._get_default_group( ) if group is None else group out = paddle.empty(in_tensor_list.shape, in_tensor_list.dtype) task = group.process_group.alltoall(in_tensor_list, out) task.wait() return out def _local_scatter(inp, pos): if pos.shape != [0]: inp_buf = paddle.index_select(inp, pos, 0) else: inp_buf = paddle.empty([0, inp.shape[1]], dtype=inp.dtype) return inp_buf def _local_gather(inp, pos, out_batch_size, maybe_overlap=True): if pos.shape != [0]: origin_dtype = inp.dtype inp = paddle.cast(inp, dtype="float32") inp_buf = paddle.scatter( paddle.zeros( shape=[out_batch_size, inp.shape[-1]], dtype="float32"), pos, inp, overwrite=True) inp_buf = paddle.cast(inp_buf, dtype=origin_dtype) else: inp_buf = paddle.zeros( [out_batch_size, inp.shape[-1]], dtype=inp.dtype) return inp_buf def _all_gather(tensor, group=None, use_calc_stream=True): if group is not None and not group.is_member(): return group = paddle.distributed.collective._get_default_group( ) if group is None else group tensor_shape = list(tensor.shape) tensor_shape[0] *= group.nranks out = paddle.empty(tensor_shape, tensor.dtype) task = group.process_group.all_gather(tensor, out) task.wait() return out def count_by_gate(gate, num_expert, world_size, require_pos=True, group=None): total_expert_count = num_expert * world_size with paddle.no_grad(): local_expert_count = _number_count(gate, total_expert_count) if world_size > 1: global_expert_count = _alltoall(local_expert_count, group=group) else: global_expert_count = local_expert_count if not require_pos: pos = None else: lec_cum = paddle.cumsum(local_expert_count, axis=0) pos = _assign_pos(gate, lec_cum) return pos, local_expert_count, global_expert_count def limit_by_capacity(topk_idx, num_expert, world_size, capacity, group=None): with paddle.no_grad(): capacity = paddle.ones( shape=[num_expert], dtype=paddle.int64) * capacity pos, lec, gec = count_by_gate( topk_idx, num_expert, world_size, require_pos=False, group=group) new_gec = _limit_by_capacity(gec, capacity, world_size) if world_size > 1: assert group.nranks == world_size new_lec = _alltoall(new_gec, group=group) else: new_lec = new_gec topk_idx = _prune_gate_by_capacity(topk_idx, new_lec, num_expert, world_size) return new_lec, new_gec, topk_idx ================================================ FILE: ppfleetx/models/language_model/moe_exp/__init__.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/language_model/moe_exp/experts.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import copy class Experts(nn.Layer): def __init__(self, expert, num_local_experts=1, expert_group_name=None): super(Experts, self).__init__() self.fleetx_experts = nn.LayerList( [copy.deepcopy(expert) for i in range(num_local_experts)]) self.num_local_experts = num_local_experts # TODO: revisit allreduce for moe.gate... for expert in self.fleetx_experts: # TODO: Create param groups to handle expert + data case (e.g. param.group = moe_group) for name, param in expert.named_parameters(): param.allreduce = False param.group_name = expert_group_name def forward(self, inputs): chunks = paddle.chunk(inputs, chunks=self.num_local_experts, axis=1) expert_outputs = [] for chunk, expert in zip(chunks, self.fleetx_experts): out = expert(chunk) if type(out) is tuple: out = out[0] # Ignore the bias term for now expert_outputs += [out] expert_output = paddle.concat(expert_outputs, axis=1) return expert_output ================================================ FILE: ppfleetx/models/language_model/moe_exp/layer.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/layer.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
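# Illustrative sketch, not part of the original sources: local expert fan-out
# with the Experts container defined in experts.py above. Shapes are demo
# assumptions; forward() splits the input into num_local_experts chunks along
# axis 1 and runs one deep-copied expert per chunk. Never called here.
def _demo_experts():
    import paddle
    import paddle.nn as nn
    from ppfleetx.models.language_model.moe_exp.experts import Experts

    experts = Experts(nn.Linear(8, 8), num_local_experts=2)
    x = paddle.randn([1, 2, 3, 8])   # [ep_size, num_local_experts, capacity, d_model]
    y = experts(x)                   # same shape as x
    return y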
import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from .experts import Experts from .sharded_moe import TopKGate, MOELayer class MoE(nn.Layer): def __init__(self, hidden_size, expert, num_experts=1, ep_size=1, k=1, capacity_factor=1., eval_capacity_factor=1., min_capacity=4, use_residual=False, noisy_gate_policy=None, drop_tokens=True, use_rts=False, enable_expert_tensor_parallelism=False): super(MoE, self).__init__() self.use_residual = use_residual self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism assert num_experts % ep_size == 0, f"Number of experts ({num_experts}) should be divisible by expert parallel size ({ep_size})" self.ep_size = ep_size self.expert_group_name = f"ep_size_{self.ep_size}" self.num_experts = num_experts self.num_local_experts = num_experts // self.ep_size # log_dist( # f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}', # [0]) assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \ 'Unsupported noisy_gate_policy: ' + noisy_gate_policy experts = Experts(expert, self.num_local_experts, self.expert_group_name) self.gate = TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, min_capacity, noisy_gate_policy, drop_tokens, use_rts) self.fleetx_moe = MOELayer(self.gate, experts, self.expert_group_name, self.ep_size, self.num_local_experts) if self.use_residual: self.mlp = expert # coefficient is used for weighted sum of the output of expert and mlp self.coefficient = nn.Linear(hidden_size, 2) def forward(self, hidden_states, used_token=None): """ MoE forward Arguments: hidden_states (Tensor): input to the layer used_token (Tensor, optional): default: None, mask only used tokens Returns: A tuple including output, gate loss, and expert count. * output (Tensor): output of the model * l_aux (Tensor): gate loss value * exp_counts (int): expert count """ output = self.fleetx_moe(hidden_states, used_token) if self.use_residual: # Residual MoE output_mlp = self.mlp(hidden_states) if type(output_mlp) is tuple: output_mlp = output_mlp[0] # Ignore the bias term for now coef = self.coefficient(hidden_states) coef = F.softmax(coef, dim=-1) output = output * coef[..., 0:1] + output_mlp * coef[..., 1:] return output ================================================ FILE: ppfleetx/models/language_model/moe_exp/mappings.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/mappings.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
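# Illustrative sketch, not part of the original sources: construction-only use
# of the MoE wrapper defined in layer.py above (4 experts, no expert
# parallelism, top-1 gating). Running forward() additionally assumes the
# distributed / expert-parallel setup this module normally relies on.
def _demo_moe_construction():
    import paddle.nn as nn
    from ppfleetx.models.language_model.moe_exp.layer import MoE

    expert = nn.Linear(64, 64)
    moe = MoE(hidden_size=64, expert=expert, num_experts=4, ep_size=1, k=1)
    return moe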
import paddle import paddle.distributed as dist from paddle.autograd import PyLayer #TODO: set axis for all_gather def _gather_tokens(input_, group, axis=0): """Gather tensors and concatenate them along a axisension""" # in case model is not deployed in distributed environment group = dist.collective._get_default_group() if group is None else group tensor_list = [paddle.empty_like(input_) for _ in range(group.nranks)] dist.all_gather(tensor_list, input_, group) output_ = paddle.concat(tensor_list, axis=axis) return output_ def _drop_tokens(input_, group, axis=0): """Divide a tensor among the tensor parallel ranks""" # in case model is not deployed in distributed environment group = dist.collective._get_default_group() if group is None else group total_chunks = group.nranks this_chunk = group.rank assert input_.shape[ axis] % total_chunks == 0, f"input dimention {axis} ({input_.shape[axis]}) is not divisible by tensor parallel world size ({total_chunks})" chunk_size = input_.shape[axis] // total_chunks return paddle.slice(input_, [axis], [this_chunk * chunk_size], [this_chunk * chunk_size + chunk_size]) class _GatherTokens(PyLayer): """All gather tokens among the tensor parallel ranks""" @staticmethod def forward(ctx, input_, group, axis): ctx.group = group ctx.axis = axis return _gather_tokens(input_, group, axis) @staticmethod def backward(ctx, grad_output): return _drop_tokens(grad_output, ctx.group, ctx.axis), None class _DropTokens(PyLayer): "Divide tokens equally among the tensor parallel ranks" @staticmethod def forward(ctx, input_, group, axis): ctx.group = group ctx.axis = axis return _drop_tokens(input_, axis) @staticmethod def backward(ctx, grad_output): return _gather_tokens(grad_output, ctx.group, ctx.axis), None def gather_tokens(input_, group=None, axis=0): if group is None or group.nranks == 1: # no tensor parallelism for non-experts return input_ return _GatherTokens.apply(input_, group, axis) def drop_tokens(input_, group=None, axis=0): if group is None or group.nranks == 1: # no tensor parallelism for non-experts return input_ return _DropTokens.apply(input_, group, axis) ================================================ FILE: ppfleetx/models/language_model/moe_exp/sharded_moe.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # The file has been adapted from a deepspeed file: # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/sharded_moe.py # Git commit hash: a091bc223c01e94448f443456a6c15684644b966 # We retain the following license from the original files: # Copyright (c) The Microsoft DeepSpeed Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
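# Illustrative sketch, not part of the original sources: behaviour of the
# token mappings defined in mappings.py above. Without a tensor-parallel group
# both calls are no-ops; with a group of size N, drop_tokens keeps 1/N of the
# chosen axis on each rank and gather_tokens concatenates the shards back.
def _demo_token_mappings():
    import paddle
    from ppfleetx.models.language_model.moe_exp.mappings import drop_tokens, gather_tokens

    x = paddle.randn([4, 6, 8])
    assert drop_tokens(x, group=None, axis=1) is x      # no group -> identity
    assert gather_tokens(x, group=None, axis=1) is x    # no group -> identity
    return x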
import paddle from typing import Callable, Dict, Tuple, Optional, Any from paddle.distribution import Uniform, Gumbel import paddle.nn.functional as F from paddle import Tensor import paddle.nn as nn import paddle.distributed as dist from paddle.autograd import PyLayer import paddle.distributed.fleet as fleet from .mappings import drop_tokens, gather_tokens uniform_map: Dict[str, Callable] = {} gumbel_map: Dict[str, Callable] = {} exp_selection_uniform_map: Dict[str, Callable] = {} def multiplicative_jitter(x, epsilon=1e-2): if epsilon == 0: return x device = paddle.get_device() uniform = uniform_map.get(device) if uniform is None: uniform = Uniform( low=paddle.to_tensor(1.0 - epsilon), high=paddle.to_tensor(1.0 + epsilon)).rsample # type: ignore uniform_map[device] = uniform return x * uniform(x.shape) def gumbel_rsample(shape): device = paddle.get_device() gumbel = gumbel_map.get(device) if gumbel is None: one = paddle.to_tensor(1.0) zero = paddle.to_tensor(0.0) gumbel = Gumbel(zero, one).rsample # type: ignore gumbel_map[device] = gumbel return gumbel(shape) # einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity # See https://arxiv.org/pdf/2006.16668.pdf for details. class _AllToAll(PyLayer): @staticmethod def forward(ctx: Any, group: dist.collective.Group, input: Tensor) -> Tensor: # type: ignore ctx.group = group output = paddle.empty_like(input) dist.alltoall_single(input, output, group=group) return output @staticmethod def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor]: return (None, _AllToAll.apply(ctx.group, *grad_output)) # einsum rewrites are on par or more performant # switch can be bubbled up in future USE_EINSUM = True # einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity # See https://arxiv.org/pdf/2006.16668.pdf for details. 
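# The helper below keeps the einsum notation of the GShard paper but, when
# USE_EINSUM is disabled, rewrites the handful of rules used by the gating code
# (e.g. "se,sc->sec", "sec,sm->ecm") into equivalent reshape/matmul/bmm ops.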
def einsum(rule, a, b): if USE_EINSUM: return paddle.einsum(rule, a, b) elif rule == 's,se->se': return a.reshape((a.shape[0], -1)) * b elif rule == 'se,sc->sec': return a.unsqueeze(2) * b.unsqueeze(1) elif rule == 'se,se->s': return paddle.bmm(paddle.unsqueeze(a, 1), paddle.unsqueeze(b, 2)).reshape((-1)) elif rule == 'sec,sm->ecm': s = a.shape[0] e = a.shape[1] c = a.shape[2] m = b.shape[1] return paddle.matmul(a.reshape((s, -1)).t(), b).reshape((e, c, m)) elif rule == 'sec,ecm->sm': return paddle.matmul( a.reshape((a.shape[0], -1)), b.reshape((-1, b.shape[-1]))) elif rule == 'ks,ksm->sm': k = b.shape[0] s = b.shape[1] m = b.shape[2] # [k, s] -> [s, k] -> [s, 1, k] a = a.t().unsqueeze(1) # [k,s,m] -> [k, sm] -> [sm, k] -> [s, m, k] b = b.reshape((k, -1)).t().reshape((s, m, k)) # bmm([s, 1, k], [s, m, k]^t) -> [s, m, 1] return paddle.bmm(a, b.transpose(1, 2)).squeeze(2) else: return paddle.einsum(rule, a, b) def _capacity(gates, capacity_factor, min_capacity): # gates has shape of SE num_tokens = gates.shape[0] num_experts = gates.shape[1] capacity = paddle.ceil( (num_tokens / num_experts) * capacity_factor).astype(paddle.int64) if capacity < min_capacity: capacity = min_capacity.astype(paddle.int64) return capacity def _top_idx(source, k): return paddle.topk(source, k=k, axis=0)[1] def top1gating(logits, capacity_factor: float, min_capacity: int, used_token: Tensor=None, noisy_gate_policy: Optional[str]=None, drop_tokens: bool=True, use_rts: bool=True) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top1Gating on logits.""" if noisy_gate_policy == 'RSample': logits_w_noise = logits + \ gumbel_rsample(logits.shape, device=logits.device) # everything is in fp32 in this function gates = F.softmax(logits, axis=1) capacity = _capacity(gates, paddle.to_tensor(capacity_factor), paddle.to_tensor(min_capacity)) # Create a mask for 1st's expert per token # noisy gating indices1_s = paddle.argmax( logits_w_noise if noisy_gate_policy == 'RSample' else gates, axis=1) num_experts = int(gates.shape[1]) assert(0 <= indices1_s.min() and indices1_s.max() < num_experts) mask1 = F.one_hot(indices1_s, num_classes=num_experts) # mask only used tokens if used_token is not None: mask1 = einsum("s,se->se", used_token, mask1) # gating decisions exp_counts = paddle.sum(mask1, axis=0).detach() # if we don't want to drop any tokens if not drop_tokens: new_capacity = paddle.max(exp_counts) # dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, # group=dist.get_world_group()) # capacity = new_capacity group = dist.collective._get_default_group() task = group.process_group.all_reduce(new_capacity, dist.ReduceOp.MAX) task.wait() # Compute l_aux me = paddle.mean(gates, axis=0) ce = paddle.mean(mask1.astype("float32"), axis=0) l_aux = paddle.sum(me * ce) * num_experts # Random Token Selection if use_rts: device = paddle.get_device() uniform = exp_selection_uniform_map.get(device) if uniform is None: uniform = Uniform( low=paddle.to_tensor(0.0), high=paddle.to_tensor(1.0)).rsample exp_selection_uniform_map[device] = uniform mask1_rand = mask1 * uniform(mask1.shape) else: mask1_rand = mask1 assert logits.shape[ 0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size." 
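    # Capacity limiting: keep at most `capacity` tokens per expert (those with
    # the largest, optionally randomized, mask values) and zero out the rest of
    # mask1 so that dropped tokens are not dispatched.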
top_idx = _top_idx(mask1_rand, capacity) new_mask1 = paddle.zeros_like(mask1).put_along_axis_( indices=top_idx, values=1., axis=0) mask1 *= new_mask1 # Compute locations in capacity buffer with paddle.amp.auto_cast(False, level='O2'): locations1 = paddle.cumsum(mask1.astype(paddle.float32), axis=0) - 1 # Store the capacity location for each token locations1_s = paddle.sum(locations1 * mask1.astype(paddle.float32), axis=1) # Normalize gate probabilities mask1_float = mask1.astype("float32") gates = gates * mask1_float assert(0 <= locations1_s.astype(paddle.int32).min() and locations1_s.astype(paddle.int32).max() < capacity) locations1_sc = F.one_hot(locations1_s.astype(paddle.int32), capacity).astype(paddle.float32) combine_weights = einsum("se,sc->sec", gates, locations1_sc) dispatch_mask = combine_weights.astype("bool") return l_aux, combine_weights, dispatch_mask, exp_counts def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, axis=1) capacity = _capacity(gates, paddle.to_tensor(capacity_factor * 2), paddle.to_tensor(min_capacity)) # Create a mask for 1st's expert per token indices1_s = paddle.argmax(gates, axis=1) num_experts = int(gates.shape[1]) mask1 = F.one_hot(indices1_s, num_classes=num_experts) # Create a mask for 2nd's expert per token using Gumbel-max trick # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/ logits_w_noise = logits + gumbel_rsample(logits.shape) # Replace top-expert with min value # logits_except1 = logits_w_noise.masked_fill(mask1.astype("bool"), float("-inf")) logits_except1 = paddle.where( mask1.astype("bool"), paddle.ones(logits_w_noise.shape) * float("-inf"), logits_w_noise) indices2_s = paddle.argmax(logits_except1, axis=1) mask2 = F.one_hot(indices2_s, num_classes=num_experts) # Compute locations in capacity buffer locations1 = paddle.cumsum(mask1, axis=0) - 1 locations2 = paddle.cumsum(mask2, axis=0) - 1 # Update 2nd's location by accounting for locations of 1st locations2 += paddle.sum(mask1, axis=0, keepdim=True) # gating decisions exp_counts = paddle.sum(mask1, axis=0).detach() # Compute l_aux me = paddle.mean(gates, axis=0) ce = paddle.mean(mask1.astype("float32"), axis=0) l_aux = paddle.mean(me * ce) * num_experts * num_experts # Remove locations outside capacity from mask mask1 *= paddle.less_than(locations1, capacity) mask2 *= paddle.less_than(locations2, capacity) # Store the capacity location for each token locations1_s = paddle.sum(locations1 * mask1, axis=1) locations2_s = paddle.sum(locations2 * mask2, axis=1) # Normalize gate probabilities mask1_float = mask1.astype("float32") mask2_float = mask2.astype("float32") gates1_s = einsum("se,se->s", gates, mask1_float) gates2_s = einsum("se,se->s", gates, mask2_float) denom_s = gates1_s + gates2_s # Avoid divide-by-zero # HACK: paddle currently does not support finfo, use constant instead min_constant = 1.1920928955078125e-07 denom_s = paddle.clip(denom_s, min=min_constant) gates1_s /= denom_s gates2_s /= denom_s # Calculate combine_weights and dispatch_mask gates1 = einsum("s,se->se", gates1_s, mask1_float) gates2 = einsum("s,se->se", gates2_s, mask2_float) locations1_sc = F.one_hot(locations1_s, capacity) locations2_sc = F.one_hot(locations2_s, capacity) combine1_sec = einsum("se,sc->sec", gates1, locations1_sc) combine2_sec = einsum("se,sc->sec", gates2, locations2_sc) combine_weights = combine1_sec + 
combine2_sec dispatch_mask = combine_weights.astype("bool") return l_aux, combine_weights, dispatch_mask, exp_counts class TopKGate(nn.Layer): """Gate module which implements Top2Gating as described in Gshard_. :: gate = TopKGate(model_dim, num_experts) l_aux, combine_weights, dispatch_mask = gate(input) .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf Args: model_dim (int): size of model embedding dimension num_experts (ints): number of experts in model """ wg: nn.Linear def __init__(self, model_dim: int, num_experts: int, k: int=1, capacity_factor: float=1.0, eval_capacity_factor: float=1.0, min_capacity: int=8, noisy_gate_policy: Optional[str]=None, drop_tokens: bool=True, use_rts: bool=True) -> None: super().__init__() # Only top-1 and top-2 are supported at the moment. if k != 1 and k != 2: raise ValueError('Only top-1 and top-2 gatings are supported.') self.wg = nn.Linear(model_dim, num_experts).to(dtype=paddle.float32) self.k = k self.capacity_factor = capacity_factor self.eval_capacity_factor = eval_capacity_factor self.min_capacity = min_capacity self.noisy_gate_policy = noisy_gate_policy # self.timers = SynchronizedWallClockTimer() self.wall_clock_breakdown = False self.gate_time = 0.0 self.drop_tokens = drop_tokens self.use_rts = use_rts def forward(self, input: paddle.Tensor, used_token: paddle.Tensor=None ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore # if self.wall_clock_breakdown: # self.timers('TopKGate').start() if self.wg.weight.dtype != paddle.float32: self.wg = self.wg.to(dtype=paddle.float32) input_fp32 = input.astype("float32") # input jittering if self.noisy_gate_policy == 'Jitter' and self.training: input_fp32 = multiplicative_jitter(input_fp32) logits = self.wg(input_fp32) if self.k == 1: gate_output = top1gating( logits, self.capacity_factor if self.training else self.eval_capacity_factor, self.min_capacity, used_token, self.noisy_gate_policy if self.training else None, self.drop_tokens, self.use_rts) else: gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, self.min_capacity) # if self.wall_clock_breakdown: # self.timers('TopKGate').stop() # self.gate_time = self.timers('TopKGate').elapsed(reset=False) return gate_output class MOELayer(nn.Layer): def __init__(self, gate: nn.Layer, experts: nn.Layer, ep_group_name, ep_size, num_local_experts: int) -> None: super().__init__() self.gate = gate self.experts = experts self.ep_group = None self.ep_size = ep_size self.ep_group_name = ep_group_name self.num_local_experts = num_local_experts self.time_falltoall = 0.0 self.time_salltoall = 0.0 self.time_moe = 0.0 # self.timers = SynchronizedWallClockTimer() self.wall_clock_breakdown = False #HACK need fix # self.hcg = fleet.get_hybrid_communicate_group() self.hcg = None def _set_ep_group(self, ep_group): self.ep_group = ep_group def get_loss(self): return self.l_aux def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: # if self.wall_clock_breakdown: # self.timers('moe').start() # Implement Algorithm 2 from GShard paper. d_model = input[0].shape[-1] # Initial implementation -> Reshape into S tokens by dropping sequence dimension. 
# Reshape into G groups so that each group can distribute tokens equally # group_size = kwargs['group_size'] if 'group_size' in kwargs.keys() else 1 reshaped_input = input[0].reshape((-1, d_model)) self.l_aux, combine_weights, dispatch_mask, self.exp_counts = self.gate( reshaped_input, input[1]) dispatched_input = einsum("sec,sm->ecm", dispatch_mask.astype(input[0].dtype), reshaped_input) # if self.wall_clock_breakdown: # self.timers('falltoall').start() # HACK: _get_expert_model_parallel_world_size is needed here if False and self.hcg.get_model_parallel_group().nranks == 1: # If the non-expert is tensor-parallel, it will create # duplicate tokens on the tensor-parallel ranks. # Since our experts are not tensor-parallel, these duplicates # need to be dropped to ensure correctness. # this also doubles up as a communication optimization as we are # reducing the all-to-all communication volume. dispatched_input = drop_tokens(dispatched_input, axis=1) # HACK disable AllToAll # dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input) # if self.wall_clock_breakdown: # self.timers('falltoall').stop() # self.time_falltoall = self.timers('falltoall').elapsed(reset=False) # Re-shape after all-to-all: ecm -> gecm dispatched_input = dispatched_input.reshape( (self.ep_size, self.num_local_experts, -1, d_model)) expert_output = self.experts(dispatched_input) # if self.wall_clock_breakdown: # self.timers('salltoall').start() # HACK disable AllToAll # expert_output = _AllToAll.apply(self.ep_group, expert_output) # if self.wall_clock_breakdown: # self.timers('salltoall').stop() # self.time_salltoall = self.timers('salltoall').elapsed(reset=False) # Re-shape back: gecm -> ecm expert_output = expert_output.reshape( (self.ep_size * self.num_local_experts, -1, d_model)) # HACK: _get_expert_model_parallel_world_size is needed here if False and self.hcg.get_model_parallel_group().nranks == 1: # the dropped duplicate tokens need to be gathered on each # tensor parallel rank again for the tensor-parallel # non-expert of the next layer. expert_output = gather_tokens(expert_output, axis=1) combined_output = einsum("sec,ecm->sm", combine_weights.astype(input[0].dtype), expert_output) a = combined_output.reshape((input[0].shape)) # if self.wall_clock_breakdown: # self.timers('moe').stop() # self.time_moe = self.timers('moe').elapsed(reset=False) return a ================================================ FILE: ppfleetx/models/language_model/t5/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .modeling import (finfo, ACT2FN, ModelOutput, get_t5_model, t5_encode_text, get_encoded_dim) from .utils import normal_, constant_init ================================================ FILE: ppfleetx/models/language_model/t5/modeling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import copy import json import numpy as np from collections import OrderedDict from typing import Callable, List, Optional, Set, Tuple, Union, Any import paddle from paddle import nn from ppfleetx.data.tokenizers.t5_tokenizer import ( t5_tokenize, get_t5_tokenizer, DEFAULT_T5_NAME) from ppfleetx.models.multimodal_model.imagen.utils import rearrange, exists, default def finfo(dtype): if dtype == paddle.float32: return np.finfo(np.float32) if dtype == paddle.float16: return np.finfo(np.float16) if dtype == paddle.float64: return np.finfo(np.float64) def fields(class_or_instance): """Return a tuple describing the fields of this dataclass. Accepts a dataclass or an instance of one. Tuple elements are of type Field. """ # Might it be worth caching this, per class? try: fields = getattr(class_or_instance, _FIELDS) except AttributeError: raise TypeError('must be called with a dataclass type or instance') # Exclude pseudo-fields. Note that fields is sorted by insertion # order, so the order of the tuple is as the fields were defined. return tuple(f for f in fields.values() if f._field_type is _FIELD) def is_tensor(x): return isinstance(x, paddle.Tensor) class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular python dictionary. You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple before. """ def __post_init__(self): class_fields = fields(self) # Safety and consistency checks if not len(class_fields): raise ValueError(f"{self.__class__.__name__} has no fields.") if not all(field.default is None for field in class_fields[1:]): raise ValueError( f"{self.__class__.__name__} should not have more than one required field." ) first_field = getattr(self, class_fields[0].name) other_fields_are_none = all( getattr(self, field.name) is None for field in class_fields[1:]) if other_fields_are_none and not is_tensor(first_field): if isinstance(first_field, dict): iterator = first_field.items() first_field_iterator = True else: try: iterator = iter(first_field) first_field_iterator = True except TypeError: first_field_iterator = False # if we provided an iterator as first field and the iterator is a (key, value) iterator # set the associated fields if first_field_iterator: for element in iterator: if (not isinstance(element, (list, tuple)) or not len(element) == 2 or not isinstance(element[0], str)): break setattr(self, element[0], element[1]) if element[1] is not None: self[element[0]] = element[1] elif first_field is not None: self[class_fields[0].name] = first_field else: for field in class_fields: v = getattr(self, field.name) if v is not None: self[field.name] = v def __delitem__(self, *args, **kwargs): raise Exception( f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." 
) def setdefault(self, *args, **kwargs): raise Exception( f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." ) def pop(self, *args, **kwargs): raise Exception( f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") def update(self, *args, **kwargs): raise Exception( f"You cannot use ``update`` on a {self.__class__.__name__} instance." ) def __getitem__(self, k): if isinstance(k, str): inner_dict = {k: v for (k, v) in self.items()} return inner_dict[k] else: return self.to_tuple()[k] def __setattr__(self, name, value): if name in self.keys() and value is not None: # Don't call self.__setitem__ to avoid recursion errors super().__setitem__(name, value) super().__setattr__(name, value) def __setitem__(self, key, value): # Will raise a KeyException if needed super().__setitem__(key, value) # Don't call self.__setattr__ to avoid recursion errors super().__setattr__(key, value) def to_tuple(self) -> Tuple[Any]: """ Convert self to a tuple containing all the attributes/keys that are not `None`. """ return tuple(self[k] for k in self.keys()) class NewGELUActivation(nn.Layer): """ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ def forward(self, input): return 0.5 * input * (1.0 + paddle.tanh( math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) class GELUActivation(nn.Layer): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. For information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))) This is now written in C in nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ def __init__(self, use_gelu_python: bool=False): super().__init__() self.act = nn.functional.gelu def _gelu_python(self, input): return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) def forward(self, input): return self.act(input) class FastGELUActivation(nn.Layer): """ Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs """ def forward(self, input): return 0.5 * input * ( 1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) class QuickGELUActivation(nn.Layer): """ Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs """ def forward(self, input): return input * paddle.nn.functional.sigmoid(1.702 * input) class ClippedGELUActivation(nn.Layer): """ Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to https://arxiv.org/abs/2004.09602. Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + paddle.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * paddle.pow(x, 3)))). 
See https://arxiv.org/abs/1606.08415 """ def __init__(self, min: float, max: float): if min > max: raise ValueError( f"min should be < max (got min: {min}, max: {max})") super().__init__() self.min = min self.max = max def forward(self, x): return paddle.clip(gelu(x), self.min, self.max) class SiLUActivation(nn.Layer): """ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with later. """ def __init__(self): super().__init__() self.act = nn.functional.silu def _silu_python(self, input): return input * nn.functional.sigmoid(input) def forward(self, input): return self.act(input) class MishActivation(nn.Layer): """ See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also visit the official repository for the paper: https://github.com/digantamisra98/Mish """ def __init__(self): super().__init__() self.act = nn.functional.mish def _mish_python(self, input): return input * paddle.tanh(nn.functional.softplus(input)) def forward(self, input): return self.act(input) class LinearActivation(nn.Layer): """ Applies the linear activation function, i.e. forwarding input directly to output. """ def forward(self, input): return input ACT2FN = { "gelu": GELUActivation(), "gelu_10": ClippedGELUActivation(-10, 10), "gelu_fast": FastGELUActivation(), "gelu_new": NewGELUActivation(), "gelu_python": GELUActivation(use_gelu_python=True), "linear": LinearActivation(), "mish": MishActivation(), "quick_gelu": QuickGELUActivation(), "relu": nn.ReLU(), "sigmoid": nn.Sigmoid(), "silu": SiLUActivation(), "swish": SiLUActivation(), "tanh": nn.Tanh(), } def get_activation(activation_string): if activation_string in ACT2FN: return ACT2FN[activation_string] else: raise KeyError( f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}" ) # For backwards compatibility with: from activations import gelu_python gelu_python = get_activation("gelu_python") gelu_new = get_activation("gelu_new") gelu = get_activation("gelu") gelu_fast = get_activation("gelu_fast") quick_gelu = get_activation("quick_gelu") silu = get_activation("silu") mish = get_activation("mish") linear_act = get_activation("linear") def prune_linear_layer(layer: nn.Linear, index: paddle.int64, dim: int=0) -> nn.Linear: """ Prune a linear layer to keep only entries in index. Used to remove heads. Args: layer (`paddle.nn.Linear`): The layer to prune. index (`paddle.int64`): The indices to keep in the layer. dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. Returns: `paddle.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`. 
""" W = layer.weight.index_select(dim, index).clone().detach() if layer.bias is not None: if dim == 1: b = layer.bias.clone().detach() else: b = layer.bias[index].clone().detach() new_size = list(layer.weight.size()) new_size[dim] = len(index) new_layer = nn.Linear( new_size[1], new_size[0], bias_attr=layer.bias is not None) new_layer.weight.requires_grad = False new_layer.weight.copy_(W) new_layer.weight.stop_gradient = False if layer.bias is not None: new_layer.bias.stop_gradient = True new_layer.bias.copy_(b) new_layer.bias.stop_gradient = False return new_layer def find_pruneable_heads_and_indices(heads, n_heads: int, head_size: int, already_pruned_heads): """ Finds the heads and their indices taking `already_pruned_heads` into account. Args: heads : List of the indices of heads to prune. n_heads : The number of heads in the model. head_size : The size of each head. already_pruned_heads : A set of already pruned heads. Returns: A tuple with the remaining heads and their corresponding indices. """ mask = paddle.ones(n_heads, head_size) heads = set( heads ) - already_pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in already_pruned_heads) mask[head] = 0 mask = mask.reshape(-1).equal(1) index = paddle.arange(len(mask))[mask].cast(paddle.int64) return heads, index class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): """ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). Args: last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output. past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. """ last_hidden_state = None past_key_values = None hidden_states = None attentions = None cross_attentions = None class T5Config(object): def __init__(self, **kwargs): # Fine-tuning task arguments self.architectures = kwargs.pop("architectures", None) self.use_return_dict = kwargs.pop("return_dict", True) self.d_ff = kwargs.pop("d_ff", None) self.d_kv = kwargs.pop("d_kv", None) self.d_model = kwargs.pop("d_model", None) self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) self.dense_act_fn = kwargs.pop("dense_act_fn", 'gelu_new') self.eos_token_id = kwargs.pop("eos_token_id", None) self.feed_forward_proj = kwargs.pop("feed_forward_proj", None) self.initializer_factor = kwargs.pop("initializer_factor", None) self.is_decoder = kwargs.pop("is_decoder", False) self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) self.is_gated_act = kwargs.pop("is_gated_act", True) self.layer_norm_epsilon = kwargs.pop("layer_norm_epsilon", None) self.model_type = kwargs.pop("model_type", None) self.num_decoder_layers = kwargs.pop("num_decoder_layers", None) self.num_heads = kwargs.pop("num_heads", None) self.num_layers = kwargs.pop("num_layers", None) self.output_past = kwargs.pop("output_past", True) self.pad_token_id = kwargs.pop("pad_token_id", None) self.relative_attention_max_distance = kwargs.pop( "relative_attention_max_distance", 128) self.relative_attention_num_buckets = kwargs.pop( "relative_attention_num_buckets", None) self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) self.transformers_version = kwargs.pop("transformers_version", None) self.use_cache = kwargs.pop("use_cache", False) self.vocab_size = kwargs.pop("vocab_size", None) self.model_type = kwargs.pop("model_type", None) self.dropout_rate = kwargs.pop("dropout_rate", None) self.output_attentions = kwargs.pop("output_attentions", False) self.output_hidden_states = kwargs.pop("output_hidden_states", False) class T5LayerNorm(nn.Layer): def __init__(self, hidden_size, eps=1e-6): super().__init__() """ Construct a layernorm module in the T5 style. No bias and no subtraction of mean. """ super().__init__() self.weight = self.create_parameter( [hidden_size], default_initializer=nn.initializer.Constant(value=1.)) self.variance_epsilon = eps def forward(self, hidden_states): # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 variance = hidden_states.cast(paddle.float32).pow(2).mean( -1, keepdim=True) hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) # convert into half-precision if necessary if self.weight.dtype in [paddle.float16, paddle.bfloat16]: hidden_states = hidden_states.cast(self.weight.dtype) return self.weight * hidden_states class T5DenseActDense(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, dense_act_fn): super().__init__() self.wi = nn.Linear(d_model, d_ff, bias_attr=False) self.wo = nn.Linear(d_ff, d_model, bias_attr=False) self.dropout = nn.Dropout(dropout_rate) self.act = ACT2FN[dense_act_fn] def forward(self, hidden_states): hidden_states = self.wi(hidden_states) hidden_states = self.act(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.wo(hidden_states) return hidden_states class T5DenseGatedActDense(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, dense_act_fn): super().__init__() self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False) self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False) self.wo = nn.Linear(d_ff, d_model, bias_attr=False) self.dropout = nn.Dropout(dropout_rate) self.act = ACT2FN[dense_act_fn] def forward(self, hidden_states): hidden_gelu = self.act(self.wi_0(hidden_states)) hidden_linear = self.wi_1(hidden_states) hidden_states = hidden_gelu * hidden_linear hidden_states = self.dropout(hidden_states) hidden_states = self.wo(hidden_states) return hidden_states class T5LayerFF(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon, feed_forward_proj): super().__init__() if feed_forward_proj == "gated-gelu": self.DenseReluDense = T5DenseGatedActDense( d_model, d_ff, dropout_rate, dense_act_fn) elif feed_forward_proj == "relu": self.DenseReluDense = T5DenseActDense(d_model, d_ff, dropout_rate, feed_forward_proj) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward(self, hidden_states): forwarded_states = self.layer_norm(hidden_states) forwarded_states = self.DenseReluDense(forwarded_states) hidden_states = hidden_states + self.dropout(forwarded_states) return hidden_states class T5Attention(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, has_relative_attention_bias=False): super().__init__() self.is_decoder = is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.relative_attention_num_buckets = relative_attention_num_buckets self.d_model = d_model self.key_value_proj_dim = d_kv self.n_heads = num_heads self.dropout = dropout_rate self.inner_dim = self.n_heads * self.key_value_proj_dim # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) self.k = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) self.v = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) self.o = nn.Linear(self.inner_dim, self.d_model, bias_attr=False) if self.has_relative_attention_bias: self.relative_attention_bias = nn.Embedding( self.relative_attention_num_buckets, self.n_heads) self.pruned_heads = set() self.gradient_checkpointing = False def prune_heads(self, heads): if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices( heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads) # Prune linear layers self.q = prune_linear_layer(self.q, 
index) self.k = prune_linear_layer(self.k, index) self.v = prune_linear_layer(self.v, index) self.o = prune_linear_layer(self.o, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.inner_dim = self.key_value_proj_dim * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) @staticmethod def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 Translate relative position to a bucket number for relative attention. The relative position is defined as memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should allow for more graceful generalization to longer sequences than the model has been trained on Args: relative_position: an int32 Tensor bidirectional: a boolean - whether the attention is bidirectional num_buckets: an integer max_distance: an integer Returns: a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) """ relative_buckets = 0 if bidirectional: num_buckets //= 2 relative_buckets += ( relative_position > 0).cast(paddle.int64) * num_buckets relative_position = paddle.abs(relative_position) else: relative_position = -paddle.min( relative_position, paddle.zeros_like(relative_position)) # now relative_position is in the range [0, inf) # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 is_small = relative_position < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance relative_position_if_large = max_exact + ( paddle.log(relative_position.cast('float32') / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)).cast(paddle.int64) relative_position_if_large = paddle.minimum( relative_position_if_large, paddle.full_like(relative_position_if_large, num_buckets - 1)) relative_buckets += paddle.where(is_small, relative_position, relative_position_if_large) return relative_buckets def compute_bias(self, query_length, key_length, device=None): """Compute binned relative position bias""" context_position = paddle.arange( query_length, dtype=paddle.int64)[:, None] memory_position = paddle.arange( key_length, dtype=paddle.int64)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( relative_position, # shape (query_length, key_length) bidirectional=(not self.is_decoder), num_buckets=self.relative_attention_num_buckets, ) values = self.relative_attention_bias( relative_position_bucket ) # shape (query_length, key_length, num_heads) values = values.transpose([2, 0, 1]).unsqueeze( 0) # shape (1, num_heads, query_length, key_length) return values def forward( self, hidden_states, mask=None, key_value_states=None, position_bias=None, past_key_value=None, layer_head_mask=None, query_length=None, use_cache=False, output_attentions=False, ): """ Self-attention (if key_value_states is None) or attention over 
source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) batch_size, seq_length = hidden_states.shape[:2] real_seq_length = seq_length if past_key_value is not None: assert ( len(past_key_value) == 2 ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" real_seq_length += past_key_value[0].shape[ 2] if query_length is None else query_length key_length = real_seq_length if key_value_states is None else key_value_states.shape[ 1] def shape(states): """projection""" return states.reshape( [0, -1, self.n_heads, self.key_value_proj_dim]).transpose( [0, 2, 1, 3]) def unshape(states): """reshape""" return states.transpose([0, 2, 1, 3]).reshape( [batch_size, -1, self.inner_dim]) def project(hidden_states, proj_layer, key_value_states, past_key_value): """projects hidden states correctly to key/query states""" if key_value_states is None: # self-attn # (batch_size, n_heads, seq_length, dim_per_head) hidden_states = shape(proj_layer(hidden_states)) elif past_key_value is None: # cross-attn # (batch_size, n_heads, seq_length, dim_per_head) hidden_states = shape(proj_layer(key_value_states)) if past_key_value is not None: if key_value_states is None: # self-attn # (batch_size, n_heads, key_length, dim_per_head) hidden_states = paddle.concat( [past_key_value, hidden_states], axis=2) else: # cross-attn hidden_states = past_key_value return hidden_states # get query states query_states = shape(self.q( hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) # get key/value states key_states = project(hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None) value_states = project(hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None) # compute scores scores = paddle.matmul( query_states, key_states.transpose([0, 1, 3, 2]) ) # equivalent of paddle.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 if position_bias is None: if not self.has_relative_attention_bias: position_bias = paddle.zeros( (1, self.n_heads, real_seq_length, key_length), dtype=scores.dtype) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: position_bias = self.compute_bias(real_seq_length, key_length) # if key and values are already calculated # we want only the last query position bias if past_key_value is not None: position_bias = position_bias[:, :, -hidden_states.size(1):, :] if mask is not None: position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) scores += position_bias attn_weights = nn.functional.softmax( scores.cast('float32'), axis=-1).astype( scores.dtype) # (batch_size, n_heads, seq_length, key_length) attn_weights = nn.functional.dropout( attn_weights, p=self.dropout, training=self. 
training) # (batch_size, n_heads, seq_length, key_length) # Mask heads if we want to if layer_head_mask is not None: attn_weights = attn_weights * layer_head_mask attn_output = unshape(paddle.matmul( attn_weights, value_states)) # (batch_size, seq_length, dim) attn_output = self.o(attn_output) present_key_value_state = (key_states, value_states) if ( self.is_decoder and use_cache) else None outputs = (attn_output, ) + (present_key_value_state, ) + ( position_bias, ) if output_attentions: outputs = outputs + (attn_weights, ) return outputs class T5LayerSelfAttention(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, has_relative_attention_bias=False): super().__init__() self.SelfAttention = T5Attention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward( self, hidden_states, attention_mask=None, position_bias=None, layer_head_mask=None, past_key_value=None, use_cache=False, output_attentions=False, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.SelfAttention( normed_hidden_states, mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states = hidden_states + self.dropout(attention_output[0]) outputs = (hidden_states, ) + attention_output[1:] # add attentions if we output them return outputs class T5LayerCrossAttention(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon): super().__init__() self.EncDecAttention = T5Attention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, has_relative_attention_bias=False) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward( self, hidden_states, key_value_states, attention_mask=None, position_bias=None, layer_head_mask=None, past_key_value=None, use_cache=False, query_length=None, output_attentions=False, ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( normed_hidden_states, mask=attention_mask, key_value_states=key_value_states, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=past_key_value, use_cache=use_cache, query_length=query_length, output_attentions=output_attentions, ) layer_output = hidden_states + self.dropout(attention_output[0]) outputs = (layer_output, ) + attention_output[1:] # add attentions if we output them return outputs class T5Block(nn.Layer): def __init__(self, is_decoder, relative_attention_num_buckets, feed_forward_proj, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, d_ff, has_relative_attention_bias=False): super().__init__() self.is_decoder = is_decoder self.layer = nn.LayerList() self.layer.append( T5LayerSelfAttention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, has_relative_attention_bias=has_relative_attention_bias)) if self.is_decoder: self.layer.append( T5LayerCrossAttention( is_decoder, relative_attention_num_buckets, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon)) self.layer.append( T5LayerFF(d_model, d_ff, dropout_rate, layer_norm_epsilon, feed_forward_proj)) 
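# Sublayer layout inside a T5Block: self.layer[0] is self-attention, self.layer[1] is
# cross-attention (decoder blocks only), and self.layer[-1] is the feed-forward layer;
# forward() below relies on this positional ordering.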
def forward( self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, layer_head_mask=None, cross_attn_layer_head_mask=None, past_key_value=None, use_cache=False, output_attentions=False, return_dict=True, ): if past_key_value is not None: if not self.is_decoder: logger.warning( "`past_key_values` is passed to the encoder. Please make sure this is intended." ) expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 if len(past_key_value) != expected_num_past_key_values: raise ValueError( f"There should be {expected_num_past_key_values} past states. " f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" f"Got {len(past_key_value)} past key / value states") self_attn_past_key_value = past_key_value[:2] cross_attn_past_key_value = past_key_value[2:] else: self_attn_past_key_value, cross_attn_past_key_value = None, None self_attention_outputs = self.layer[0]( hidden_states, attention_mask=attention_mask, position_bias=position_bias, layer_head_mask=layer_head_mask, past_key_value=self_attn_past_key_value, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states, present_key_value_state = self_attention_outputs[:2] attention_outputs = self_attention_outputs[ 2:] # Keep self-attention outputs and relative position weights # clamp inf values to enable fp16 training if hidden_states.dtype == paddle.float16 and paddle.isinf( hidden_states).any(): clamp_value = finfo(hidden_states.dtype).max - 1000 hidden_states = paddle.clip( hidden_states, min=-clamp_value, max=clamp_value) do_cross_attention = self.is_decoder and encoder_hidden_states is not None if do_cross_attention: # the actual query length is unknown for cross attention # if using past key value states. 
Need to inject it here if present_key_value_state is not None: query_length = present_key_value_state[0].shape[2] else: query_length = None cross_attention_outputs = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, attention_mask=encoder_attention_mask, position_bias=encoder_decoder_position_bias, layer_head_mask=cross_attn_layer_head_mask, past_key_value=cross_attn_past_key_value, query_length=query_length, use_cache=use_cache, output_attentions=output_attentions, ) hidden_states = cross_attention_outputs[0] # clamp inf values to enable fp16 training if hidden_states.dtype == paddle.float16 and paddle.isinf( hidden_states).any(): clamp_value = finfo(hidden_states.dtype).max - 1000 hidden_states = paddle.clip( hidden_states, min=-clamp_value, max=clamp_value) # Combine self attn and cross attn key value states if present_key_value_state is not None: present_key_value_state = present_key_value_state + cross_attention_outputs[ 1] # Keep cross-attention outputs and relative position weights attention_outputs = attention_outputs + cross_attention_outputs[2:] # Apply Feed Forward layer hidden_states = self.layer[-1](hidden_states) # clamp inf values to enable fp16 training if hidden_states.dtype == paddle.float16 and paddle.isinf( hidden_states).any(): clamp_value = finfo(hidden_states.dtype).max - 1000 hidden_states = paddle.clip( hidden_states, min=-clamp_value, max=clamp_value) outputs = (hidden_states, ) if use_cache: outputs = outputs + (present_key_value_state, ) + attention_outputs else: outputs = outputs + attention_outputs return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) class T5Stack(nn.Layer): def __init__(self, d_model, num_layers, layer_norm_epsilon, dropout_rate, relative_attention_num_buckets, feed_forward_proj, d_kv, num_heads, d_ff, embed_tokens=None, is_decoder=False): super().__init__() self.embed_tokens = embed_tokens self.is_decoder = is_decoder self.num_layers = num_layers self.block = nn.LayerList([ T5Block( is_decoder, relative_attention_num_buckets, feed_forward_proj, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon, d_ff, has_relative_attention_bias=bool(i == 0)) for i in range(num_layers) ]) self.final_layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def get_input_embeddings(self): return self.embed_tokens def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings def get_extended_attention_mask(self, attention_mask, input_shape): """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: attention_mask (`paddle.Tensor`): Mask with ones indicating tokens to attend to, zeros for tokens to ignore. input_shape (`Tuple[int]`): The shape of the input to the model. Returns: `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. """ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
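# For example, a 2D padding mask of shape [batch_size, seq_length] is expanded below to
# [batch_size, 1, 1, seq_length], so that it broadcasts over every attention head and
# query position when it is later added to the raw attention scores.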
if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. #extended_attention_mask = extended_attention_mask.cast(dtype='float16') # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False): """ Prepare the head mask if needed. Args: head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). num_hidden_layers (`int`): The number of hidden layers in the model. is_attention_chunked: (`bool`, *optional*, defaults to `False`): Whether or not the attentions scores are computed by chunks or not. Returns: `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with `[None]` for each layer. """ if head_mask is not None: head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: head_mask = [None] * num_hidden_layers return head_mask def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( -1).unsqueeze(-1) head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( -1) # We can specify head_mask for each layer assert head_mask.dim( ) == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" #head_mask = head_mask.cast(dtype=self.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( self, input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, inputs_embeds=None, head_mask=None, cross_attn_head_mask=None, past_key_values=None, use_cache=False, output_attentions=False, output_hidden_states=False, return_dict=True, ): if use_cache is True: assert ( self.is_decoder ), f"`use_cache` can only be set to `True` if {self} is used as a decoder" output_hidden_states = (output_hidden_states if output_hidden_states is not None else False) if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError( f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" ) elif input_ids is not None: input_shape = input_ids.shape input_ids = input_ids.reshape([-1, input_shape[-1]]) elif inputs_embeds is not None: input_shape = inputs_embeds.shape[:-1] else: err_msg_prefix 
= "decoder_" if self.is_decoder else "" raise ValueError( f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds" ) if inputs_embeds is None: assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" inputs_embeds = self.embed_tokens(input_ids) batch_size, seq_length = input_shape # required mask seq length can be calculated via length of past mask_seq_length = past_key_values[0][0].shape[ 2] + seq_length if past_key_values is not None else seq_length if use_cache is True: assert self.is_decoder, f"`use_cache` can only be set to `True` if {self} is used as a decoder" if attention_mask is None: attention_mask = paddle.ones(batch_size, mask_seq_length) if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = paddle.ones( batch_size, encoder_seq_length, dtype=paddle.int64) # initialize past_key_values with `None` if past does not exist if past_key_values is None: past_key_values = [None] * len(self.block) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask = self.get_extended_attention_mask( attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) encoder_extended_attention_mask = self.invert_attention_mask( encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.num_layers) cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.num_layers) present_key_value_states = () if use_cache else None all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None all_cross_attentions = () if (output_attentions and self.is_decoder) else None position_bias = None encoder_decoder_position_bias = None hidden_states = self.dropout(inputs_embeds) for i, (layer_module, past_key_value ) in enumerate(zip(self.block, past_key_values)): layer_head_mask = head_mask[i] cross_attn_layer_head_mask = cross_attn_head_mask[i] if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) layer_outputs = layer_module( hidden_states, attention_mask=extended_attention_mask, position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, layer_head_mask=layer_head_mask, cross_attn_layer_head_mask=cross_attn_layer_head_mask, past_key_value=past_key_value, use_cache=use_cache, output_attentions=output_attentions, ) # layer_outputs is a tuple with: # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) if use_cache is False: layer_outputs = layer_outputs[:1] + (None, ) + layer_outputs[1:] hidden_states, present_key_value_state = layer_outputs[:2] # We share the position biases between the layers - 
the first layer store them # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), # (cross-attention position bias), (cross-attention weights) position_bias = layer_outputs[2] if self.is_decoder and encoder_hidden_states is not None: encoder_decoder_position_bias = layer_outputs[ 4 if output_attentions else 3] # append next layer key value states if use_cache: present_key_value_states = present_key_value_states + ( present_key_value_state, ) if output_attentions: all_attentions = all_attentions + (layer_outputs[3], ) if self.is_decoder: all_cross_attentions = all_cross_attentions + ( layer_outputs[5], ) hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) if not return_dict: return tuple(v for v in [ hidden_states, present_key_value_states, all_hidden_states, all_attentions, all_cross_attentions, ] if v is not None) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=present_key_value_states, hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, ) class T5EncoderModel(nn.Layer): authorized_missing_keys = [r"encoder.embed_tokens.weight", ] def __init__(self, vocab_size=32128, d_model=768, d_kv=64, d_ff=3072, num_layers=12, num_decoder_layers=12, num_heads=12, relative_attention_num_buckets=32, dropout_rate=0.1, layer_norm_epsilon=1e-06, feed_forward_proj="relu"): super().__init__() self.shared = nn.Embedding(vocab_size, d_model) # self.extra_parameters = list(self.shared.parameters()) use_cache = False is_encoder_decoder = False self.encoder = T5Stack( d_model, num_layers, layer_norm_epsilon, dropout_rate, relative_attention_num_buckets, feed_forward_proj, d_kv, num_heads, d_ff, embed_tokens=self.shared, is_decoder=False) def get_input_embeddings(self): return self.shared def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) def get_encoder(self): return self.encoder def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Returns: Example: ```python >>> from transformers import T5Tokenizer, T5EncoderModel >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") >>> model = T5EncoderModel.from_pretrained("t5-small") >>> input_ids = tokenizer( ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" ... 
).input_ids # Batch size 1 >>> outputs = model(input_ids=input_ids) >>> last_hidden_states = outputs.last_hidden_state ```""" return_dict = return_dict if return_dict is not None else True #import numpy as np #attention_mask = paddle.to_tensor(np.load('attn_mask.npy')) #input_ids = paddle.to_tensor(np.load('input_ids.npy')) encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) return encoder_outputs def T5Model(config): config = T5Config(**config) model = T5EncoderModel(config) return model def get_t5_model(name, pretrained=True): #t5_config = dict_from_json_file(name) #model = T5Model(t5_config) model = T5EncoderModel( vocab_size=32128, d_model=1024, d_kv=128, d_ff=65536, num_layers=2, num_decoder_layers=None, num_heads=128, relative_attention_num_buckets=32, dropout_rate=0., layer_norm_epsilon=1e-06, feed_forward_proj="relu") if pretrained: checkpoint = paddle.load(name + '/t5.pd', return_numpy=True) model.set_state_dict(checkpoint['model']) model.eval() for p in model.parameters(): p.stop_gradient = True return model def t5_11b(): return T5EncoderModel( vocab_size=32128, d_model=1024, d_kv=128, d_ff=65536, num_layers=24, num_decoder_layers=None, num_heads=128, relative_attention_num_buckets=32, dropout_rate=0., layer_norm_epsilon=1e-06, feed_forward_proj="relu") def dict_from_json_file(name): with open(name + '/config.json', "r", encoding="utf-8") as reader: text = reader.read() config_dict = json.loads(text) return config_dict def t5_encode_text(t5, texts, tokenizer, return_attn_mask=False): token_ids, attn_mask = t5_tokenize(texts, tokenizer) t5.eval() with paddle.no_grad(): encoded_text = t5(input_ids=token_ids, attention_mask=attn_mask) text_features = encoded_text.last_hidden_state.detach() if return_attn_mask: #attn_mask = attn_mask.cast('bool') return text_features, attn_mask return text_features def get_encoded_dim(name): return dict_from_json_file(name)['d_model'] ================================================ FILE: ppfleetx/models/language_model/t5/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
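# A typical call pattern for the T5 encoder utilities defined in modeling.py above
# (a sketch only: the checkpoint directory is a placeholder, and the exact argument of
# get_t5_tokenizer is an assumption rather than taken from this file):
#
#     tokenizer = get_t5_tokenizer(DEFAULT_T5_NAME)
#     t5 = get_t5_model('./t5-checkpoint', pretrained=False)  # pretrained=True loads '<name>/t5.pd'
#     text_features = t5_encode_text(t5, ['a photo of a corgi'], tokenizer)
#     # text_features: last_hidden_state of shape [batch, seq_len, d_model], detached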
import paddle from paddle.nn.initializer import TruncatedNormal, Constant, Normal trunc_normal_ = TruncatedNormal(std=0.02) zeros_ = Constant(value=0.0) ones_ = Constant(value=1.0) @paddle.no_grad() def constant_(x, value): temp_value = paddle.full(x.shape, value, x.dtype) x.set_value(temp_value) return x @paddle.no_grad() def normal_(x, mean=0., std=1.): temp_value = paddle.normal(mean, std, shape=x.shape) x.set_value(temp_value) return def normal_init(layer, mean=0, std=1, bias=0): if hasattr(layer, 'weight') and layer.weight is not None: normal_(layer.weight, mean, std) else: normal_(layer, mean, std) if hasattr(layer, 'bias') and layer.bias is not None: constant_(layer.bias, bias) def constant_init(layer, val, bias=0): if hasattr(layer, 'weight') and layer.weight is not None: constant_(layer.weight, val) if hasattr(layer, 'bias') and layer.bias is not None: constant_(layer.bias, bias) ================================================ FILE: ppfleetx/models/language_model/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import copy import yaml import numpy as np import paddle import paddle.distributed as dist from paddle.fluid import core import argparse from functools import reduce from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False def process_inference_configs(config): """ process inference configs for hybrid parallel """ if 'Inference' not in config.keys(): return configs = config['Inference'] if configs['model_dir'] is None: configs['model_dir'] = config['Engine']['save_load']['output_dir'] if configs['mp_degree'] is None: configs['mp_degree'] = config['Distributed']['mp_degree'] def process_model_configs(config): """ process model configs for hybrid parallel """ configs = config['Model'] if configs['ffn_hidden_size'] is None: configs['ffn_hidden_size'] = 4 * configs['hidden_size'] if configs['use_recompute']: if not configs['recompute_granularity']: configs['recompute_granularity'] = 'full' if not configs['no_recompute_layers']: configs['no_recompute_layers'] = [] else: assert isinstance(configs['no_recompute_layers'], list), "no_recompute_layers should be a list" for i in configs['no_recompute_layers']: assert isinstance( i, int ), "all values in no_recompute_layers should be an integer" assert min(configs['no_recompute_layers']) >= 0, \ "the min value in no_recompute_layers should >= 0" assert max(configs['no_recompute_layers']) < configs['num_layers'], \ "the max value in no_recompute_layers should < num_layers" configs['no_recompute_layers'] = sorted( list(set(configs['no_recompute_layers']))) if configs['fused_linear'] and not is_fused_matmul_bias_supported(): configs['fused_linear'] = False logging.warning( "The flag 
fused_linear only valid for cuda version higher than 11.6, " "but the paddle is compiled with cuda " + paddle.version.cuda()) pp_degree = config.Distributed.pp_degree if pp_degree > 1: configs['virtual_pp_degree'] = 1 \ if configs.get('virtual_pp_degree', None) is None \ else configs['virtual_pp_degree'] virtual_pp_degree = configs['virtual_pp_degree'] num_layers = configs.num_layers if not (num_layers % (virtual_pp_degree * pp_degree)) == 0: assert virtual_pp_degree == 1, "virtual pp doesn't support uneven layer split." logger.warning( "The num_layers of the model is not divisible by pp_degree." \ "Receive num_layers: {}, pp_degree: {}.".format(num_layers, pp_degree)) else: assert (num_layers % (virtual_pp_degree * pp_degree)) == 0, \ "The num_layers of the model should be divisible of pp_degree * virtual_pp_degree." \ "Receive num_layers: {}, pp_degree: {}, virtual_pp_degree: {}.".format( num_layers, pp_degree, virtual_pp_degree) if virtual_pp_degree > 1: local_batch_size = config.Global.local_batch_size micro_batch_size = config.Global.micro_batch_size acc_steps = local_batch_size // micro_batch_size assert acc_steps % pp_degree == 0, "num of microbatches {} should be divisible of pp_degree {} when " \ "using interleave pipeline".format(acc_steps, pp_degree) if virtual_pp_degree > 2: logger.warning( "Setting virtual_pp_degree > 2 may harm the throughput of the pipeline parallel." ) else: if configs.get('virtual_pp_degree', None): logger.warning("virtual_pp_degree is unuseful.") def process_optim_configs(config): """ process optim configs for hybrid parallel """ config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][ 'enable'] nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] if config['Optimizer']['tensor_fusion']: assert nranks == dp_degree * sharding_degree, \ "tensor_fusion only support single card train or data/sharding parallel train" if config['Optimizer']['lr']['decay_steps'] is None: config['Optimizer']['lr']['decay_steps'] = config['Engine'][ 'max_steps'] config['Optimizer']['lr']['decay_steps'] *= config['Global'][ 'global_batch_size'] def process_data_configs(config): """ process data configs for hybrid parallel """ cfg_global = config['Global'] cfg_data = config['Data'] mode_to_num_samples = { "Train": cfg_global['global_batch_size'] * config['Engine']['max_steps'], "Eval": cfg_global['global_batch_size'] * (config['Engine']['max_steps'] // config['Engine']['eval_freq'] + 1) * config['Engine']['eval_iters'], "Test": cfg_global['global_batch_size'] * config['Engine']['test_iters'], } for mode in ("Train", "Eval", "Test"): if mode in cfg_data.keys(): cfg_data[mode]['dataset']['num_samples'] = mode_to_num_samples[ mode] cfg_data[mode]['dataset']['mode'] = mode cfg_data[mode]['dataset']['seed'] = cfg_global['seed'] cfg_data[mode]['dataset']['model_type'] = config['Model']['name'] cfg_data[mode]['sampler']['batch_size'] = cfg_global[ 'local_batch_size'] def process_configs(config): process_data_configs(config) process_model_configs(config) process_optim_configs(config) process_inference_configs(config) return config ================================================ FILE: ppfleetx/models/multimodal_model/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/multimodal_model/clip/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/multimodal_model/imagen/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .modeling import (ImagenModel, imagen_397M_text2im_64, imagen_text2im_64, imagen_text2im_64_debertav2, imagen_SR256, imagen_SR1024, ImagenCriterion) ================================================ FILE: ppfleetx/models/multimodal_model/imagen/modeling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
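(A minimal, illustrative training-step sketch for the Imagen classes defined in this file, not part of the repository sources: the dataloader, optimizer settings, and the 't5-11b' encoder name are assumptions. It only shows how the pieces fit together, since ImagenModel.forward returns the tuple (pred, target, log_snr, p2_loss_weight_gamma) that ImagenCriterion reduces to a scalar loss.)

import paddle
from ppfleetx.models.multimodal_model.imagen import imagen_text2im_64, ImagenCriterion

def example_train_step(dataloader):
    # hypothetical helper; 'dataloader' yields (images, texts) where images are
    # square float32 RGB tensors of at least 64x64 and texts is a list of captions
    model = imagen_text2im_64(
        use_recompute=False,
        recompute_granularity='full',
        text_encoder_name='t5-11b')  # encoder name is an assumption
    criterion = ImagenCriterion(name='mse_loss')
    opt = paddle.optimizer.AdamW(parameters=model.parameters())

    for images, texts in dataloader:
        # the forward pass encodes the captions, noises the images, and returns
        # the prediction/target pair plus the p2-reweighting inputs
        pred, target, log_snr, gamma = model(images, texts=texts, unet_number=1)
        loss = criterion(pred, target, log_snr, gamma)
        loss.backward()
        opt.step()
        opt.clear_grad()
    return model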
from tqdm import tqdm from functools import partial from contextlib import contextmanager, nullcontext import paddle import paddle.nn.functional as F from paddle import nn import paddle.vision.transforms as T from .unet import Unet from ppfleetx.models.language_model.debertav2 import * from ppfleetx.models.language_model.t5 import * from ppfleetx.data.tokenizers import get_t5_tokenizer, get_debertav2_tokenizer from .utils import ( GaussianDiffusionContinuousTimes, default, exists, cast_tuple, first, maybe, eval_decorator, identity, pad_tuple_to_length, right_pad_dims_to, resize_image_to, normalize_neg_one_to_one, rearrange, repeat, reduce, unnormalize_zero_to_one, cast_uint8_images_to_float, is_float_dtype) # predefined unets, with configs lining up with hyperparameters in appendix of paper class Unet64_397M(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=256, dim_mults=(1, 2, 3, 4), num_resnet_blocks=3, layer_attns=(False, True, True, True), layer_cross_attns=(False, True, True, True), attn_heads=8, ff_mult=2., memory_efficient=False) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) class BaseUnet64(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=512, cond_dim=512, dim_mults=(1, 2, 3, 4), num_resnet_blocks=3, layer_attns=(False, True, True, True), layer_cross_attns=(False, True, True, True), attn_heads=8, ff_mult=2., memory_efficient=False) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) class SRUnet256(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=128, dim_mults=(1, 2, 4, 8), num_resnet_blocks=(2, 4, 8, 8), layer_attns=(False, False, False, True), layer_cross_attns=(False, False, False, True), attn_heads=8, ff_mult=2., memory_efficient=True) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) class SRUnet1024(Unet): def __init__(self, *args, **kwargs): default_kwargs = dict( dim=128, dim_mults=(1, 2, 4, 8), num_resnet_blocks=(2, 4, 8, 8), layer_attns=False, layer_cross_attns=(False, False, False, True), attn_heads=8, ff_mult=2., memory_efficient=True) super().__init__(*args, **{ ** default_kwargs, ** kwargs}) # main imagen ddpm class, which is a cascading DDPM from Ho et al. class ImagenCriterion(nn.Layer): """ Criterion for Imagen. It calculates the final loss. """ def __init__(self, name='mse_loss', p2_loss_weight_k=1): super(ImagenCriterion, self).__init__() self.p2_loss_weight_k = p2_loss_weight_k if name == 'l1_loss': self.loss_func = F.l1_loss elif name == 'mse_loss': self.loss_func = F.mse_loss elif name == 'smooth_l1_loss': self.loss_func = F.smooth_l1_loss else: raise NotImplementedError() def forward(self, pred, target, log_snr, p2_loss_weight_gamma): """ Args: pred(Tensor): The logits of prediction. Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. target(Tensor): The labels of the prediction, default is noise. Returns: Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ losses = self.loss_func(pred, target, reduction="none") losses = reduce(losses, 'b ... 
-> b', 'mean') # p2 loss reweighting if p2_loss_weight_gamma > 0: loss_weight = ( self.p2_loss_weight_k + log_snr.exp())**-p2_loss_weight_gamma losses = losses * loss_weight return losses.mean() class ImagenModel(nn.Layer): def __init__( self, unets, image_sizes, text_encoder_name=None, text_embed_dim=1024, channels=3, timesteps=1000, cond_drop_prob=0.1, noise_schedules='cosine', pred_objectives='noise', random_crop_sizes=None, lowres_noise_schedule='linear', lowres_sample_noise_level=0.2, per_sample_random_aug_noise_level=False, condition_on_text=True, auto_normalize_img=True, p2_loss_weight_gamma=0.5, dynamic_thresholding=True, dynamic_thresholding_percentile=0.95, only_train_unet_number=None, is_sr=False, is_video=False, fused_linear=False, ): super().__init__() # conditioning hparams self.condition_on_text = condition_on_text self.unconditional = not condition_on_text self.is_sr = is_sr self.is_video = is_video # channels self.channels = channels # automatically take care of ensuring that first unet is unconditional # while the rest of the unets are conditioned on the low resolution image produced by previous unet unets = cast_tuple(unets) num_unets = len(unets) # determine noise schedules per unet timesteps = cast_tuple(timesteps, num_unets) # make sure noise schedule defaults to 'cosine', 'cosine', and then 'linear' for rest of super-resoluting unets noise_schedules = cast_tuple(noise_schedules) noise_schedules = pad_tuple_to_length(noise_schedules, 2, 'cosine') noise_schedules = pad_tuple_to_length(noise_schedules, num_unets, 'linear') # construct noise schedulers noise_scheduler_klass = GaussianDiffusionContinuousTimes self.noise_schedulers = nn.LayerList([]) for timestep, noise_schedule in zip(timesteps, noise_schedules): noise_scheduler = noise_scheduler_klass( noise_schedule=noise_schedule, timesteps=timestep) self.noise_schedulers.append(noise_scheduler) # randomly cropping for upsampler training self.random_crop_sizes = cast_tuple(random_crop_sizes, num_unets) assert not exists( first(self.random_crop_sizes) ), 'you should not need to randomly crop image during training for base unet, only for upsamplers - so pass in `random_crop_sizes = (None, 128, 256)` as example' # lowres augmentation noise schedule self.lowres_noise_schedule = GaussianDiffusionContinuousTimes( noise_schedule=lowres_noise_schedule) # ddpm objectives - predicting noise by default self.pred_objectives = cast_tuple(pred_objectives, num_unets) # get text encoder self.text_encoder_name = text_encoder_name if text_encoder_name is None: pass elif 't5' in text_encoder_name: self.text_embed_dim = default( text_embed_dim, lambda: get_encoded_dim(text_encoder_name)) self.t5_encoder = get_t5_model( name=text_encoder_name, pretrained=True) self.tokenizer = get_t5_tokenizer(name=text_encoder_name) self.t5_encode_text = t5_encode_text elif 'deberta' in text_encoder_name: self.text_embed_dim = default( text_embed_dim, lambda: get_debertav2_encoded_dim(text_encoder_name)) self.debertav2_encoder = get_debertav2_model( name=text_encoder_name, pretrained=True) self.tokenizer = get_debertav2_tokenizer(name=text_encoder_name) self.debertav2_encode_text = debertav2_encode_text else: raise NotImplementedError("Please implement the text encoder.") # construct unets self.unets = nn.LayerList([]) self.unet_being_trained_index = -1 # keeps track of which unet is being trained at the moment self.only_train_unet_number = only_train_unet_number for ind, one_unet in enumerate(unets): assert isinstance(one_unet, Unet) is_first = ind == 0 
one_unet = one_unet.cast_model_parameters( cond_on_text=self.condition_on_text, text_embed_dim=self.text_embed_dim if self.condition_on_text else None, channels=self.channels, channels_out=self.channels) self.unets.append(one_unet) # unet image sizes image_sizes = cast_tuple(image_sizes) self.image_sizes = image_sizes self.sample_channels = cast_tuple(self.channels, num_unets) self.right_pad_dims_to_datatype = partial( rearrange, pattern=('b -> b 1 1 1')) # cascading ddpm related stuff self.lowres_sample_noise_level = lowres_sample_noise_level self.per_sample_random_aug_noise_level = per_sample_random_aug_noise_level # classifier free guidance self.cond_drop_prob = cond_drop_prob self.can_classifier_guidance = cond_drop_prob > 0. # normalize and unnormalize image functions self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity self.input_image_range = (0. if auto_normalize_img else -1., 1.) # dynamic thresholding self.dynamic_thresholding = cast_tuple(dynamic_thresholding, num_unets) self.dynamic_thresholding_percentile = dynamic_thresholding_percentile # p2 loss weight self.p2_loss_weight_gamma = cast_tuple(p2_loss_weight_gamma, num_unets) assert all([ (gamma_value <= 2) for gamma_value in self.p2_loss_weight_gamma ]), 'in paper, they noticed any gamma greater than 2 is harmful' # one temp parameter for keeping track of device def get_unet(self, unet_number): assert 0 < unet_number <= len(self.unets) index = unet_number - 1 if isinstance(self.unets, nn.LayerList): unets_list = [unet for unet in self.unets] delattr(self, 'unets') self.unets = unets_list self.unet_being_trained_index = index return self.unets[index] def reset_unets(self, ): self.unets = nn.LayerList([*self.unets]) self.unet_being_trained_index = -1 @contextmanager def one_unet_in_gpu(self, unet_number=None, unet=None): assert exists(unet_number) ^ exists(unet) if exists(unet_number): unet = self.unets[unet_number - 1] yield def reset_unets_all(self, ): self.unets = nn.LayerList([*self.unets]) self.unet_being_trained_index = -1 # overriding state dict functions def state_dict(self, *args, **kwargs): self.reset_unets() return super().state_dict(*args, **kwargs) def load_state_dict(self, *args, **kwargs): self.reset_unets_all() return self.unets[self.unet_being_trained_index].set_state_dict( *args, **kwargs) # gaussian diffusion methods def p_mean_variance(self, unet, x, t, *, noise_scheduler, text_embeds=None, text_mask=None, cond_images=None, lowres_cond_img=None, self_cond=None, lowres_noise_times=None, cond_scale=1., model_output=None, t_next=None, pred_objective='noise', dynamic_threshold=True): assert not ( cond_scale != 1. 
and not self.can_classifier_guidance ), 'imagen was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)' time_var = noise_scheduler.get_condition(t) pred = default(model_output, lambda: unet.forward_with_cond_scale(x, time_var, text_embeds = text_embeds, text_mask = text_mask, cond_images = cond_images, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, lowres_noise_times = self.lowres_noise_schedule.get_condition(lowres_noise_times))) if pred_objective == 'noise': x_start = noise_scheduler.predict_start_from_noise( x, t=t, noise=pred) elif pred_objective == 'x_start': x_start = pred elif pred_objective == 'v': x_start = noise_scheduler.predict_start_from_v(x, t=t, v=pred) else: raise ValueError(f'unknown objective {pred_objective}') if dynamic_threshold: # following pseudocode in appendix # s is the dynamic threshold, determined by percentile of absolute values of reconstructed sample per batch element s = paddle.quantile( rearrange(x_start, 'b ... -> b (...)').abs(), self.dynamic_thresholding_percentile, axis=-1) s.clip_(min=1.) s = right_pad_dims_to(x_start, s) x_start = x_start.clip(-s, s) / s else: x_start.clip_(-1., 1.) mean_and_variance = noise_scheduler.q_posterior( x_start=x_start, x_t=x, t=t, t_next=t_next) return mean_and_variance, x_start @paddle.no_grad() def p_sample(self, unet, x, t, *, noise_scheduler, t_next=None, text_embeds=None, text_mask=None, cond_images=None, cond_scale=1., self_cond=None, lowres_cond_img=None, lowres_noise_times=None, pred_objective='noise', dynamic_threshold=True): b = x.shape[0] (model_mean, _, model_log_variance), x_start = self.p_mean_variance( unet, x=x, t=t, t_next=t_next, noise_scheduler=noise_scheduler, text_embeds=text_embeds, text_mask=text_mask, cond_images=cond_images, cond_scale=cond_scale, lowres_cond_img=lowres_cond_img, self_cond=self_cond, lowres_noise_times=lowres_noise_times, pred_objective=pred_objective, dynamic_threshold=dynamic_threshold) noise = paddle.randn(shape=x.shape, dtype=x.dtype) # no noise when t == 0 is_last_sampling_timestep = (t_next == 0) if isinstance( noise_scheduler, GaussianDiffusionContinuousTimes) else (t == 0) nonzero_mask = (1 - is_last_sampling_timestep.cast('float32')).reshape( [b, *((1, ) * (len(x.shape) - 1))]) pred = model_mean + nonzero_mask * (0.5 * model_log_variance ).exp() * noise return pred, x_start @paddle.no_grad() def p_sample_loop(self, unet, shape, *, noise_scheduler, lowres_cond_img=None, lowres_noise_times=None, text_embeds=None, text_mask=None, cond_images=None, inpaint_images=None, inpaint_masks=None, inpaint_resample_times=5, init_images=None, skip_steps=None, cond_scale=1, pred_objective='noise', dynamic_threshold=True): batch = shape[0] img = paddle.randn(shape) # for initialization with an image or video if exists(init_images): img += init_images # keep track of x0, for self conditioning x_start = None # prepare inpainting has_inpainting = exists(inpaint_images) and exists(inpaint_masks) resample_times = inpaint_resample_times if has_inpainting else 1 if has_inpainting: inpaint_images = self.normalize_img(inpaint_images) inpaint_images = resize_image_to(inpaint_images, shape[-1]) inpaint_masks = resize_image_to( rearrange(inpaint_masks, 'b ... 
-> b 1 ...').cast('float32'), shape[-1]).cast('bool') # time timesteps = noise_scheduler.get_sampling_timesteps(batch) # whether to skip any steps skip_steps = default(skip_steps, 0) timesteps = timesteps[skip_steps:] for times, times_next in tqdm( timesteps, desc='sampling loop time step', total=len(timesteps)): is_last_timestep = times_next == 0 for r in reversed(range(resample_times)): is_last_resample_step = r == 0 if has_inpainting: noised_inpaint_images, *_ = noise_scheduler.q_sample( inpaint_images, t=times) img = img * ~inpaint_masks + noised_inpaint_images * inpaint_masks self_cond = x_start if unet.self_cond else None img, x_start = self.p_sample( unet, img, times, t_next=times_next, text_embeds=text_embeds, text_mask=text_mask, cond_images=cond_images, cond_scale=cond_scale, self_cond=self_cond, lowres_cond_img=lowres_cond_img, lowres_noise_times=lowres_noise_times, noise_scheduler=noise_scheduler, pred_objective=pred_objective, dynamic_threshold=dynamic_threshold) if has_inpainting and not (is_last_resample_step or paddle.all(is_last_timestep)): renoised_img = noise_scheduler.q_sample_from_to( img, times_next, times) img = paddle.where( self.right_pad_dims_to_datatype(is_last_timestep), img, renoised_img) img.clip_(-1., 1.) # final inpainting if has_inpainting: img = img * ~inpaint_masks + inpaint_images * inpaint_masks unnormalize_img = self.unnormalize_img(img) return unnormalize_img @paddle.no_grad() @eval_decorator def sample( self, texts=None, text_masks=None, text_embeds=None, cond_images=None, inpaint_images=None, inpaint_masks=None, inpaint_resample_times=5, init_images=None, skip_steps=None, batch_size=1, cond_scale=1., lowres_sample_noise_level=None, start_at_unet_number=1, start_image_or_video=None, stop_at_unet_number=None, return_all_unet_outputs=True, return_pil_images=False, ): self.reset_unets() cond_images = maybe(cast_uint8_images_to_float)(cond_images) if exists(texts) and not exists( text_embeds) and not self.unconditional: with paddle.amp.auto_cast(enable=False): if 't5' in self.text_encoder_name: text_embeds, text_masks = self.t5_encode_text( t5=self.t5_encoder, texts=texts, return_attn_mask=True) elif 'debert' in self.text_encoder_name: text_embeds, text_masks = self.debertav2_encode_text( debertav2=self.debertav2_encoder, texts=texts, return_attn_mask=True) if not self.unconditional: text_masks = default( text_masks, lambda: paddle.any(text_embeds != 0., axis=-1)) batch_size = text_embeds.shape[0] if exists(inpaint_images): if self.unconditional: if batch_size == 1: # assume researcher wants to broadcast along inpainted images batch_size = inpaint_images.shape[0] assert inpaint_images.shape[ 0] == batch_size, 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=)``' assert not ( self.condition_on_text and inpaint_images.shape[0] != text_embeds.shape[0] ), 'number of inpainting images must be equal to the number of text to be conditioned on' assert not ( self.condition_on_text and not exists(text_embeds) ), 'text or text encodings must be passed into imagen if specified' assert not ( not self.condition_on_text and exists(text_embeds) ), 'imagen specified not to be conditioned on text, yet it is presented' assert not ( exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim ), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})' assert not ( exists(inpaint_images) ^ exists(inpaint_masks) ), 'inpaint images and masks must be both passed in to do inpainting' outputs 
= [] lowres_sample_noise_level = default(lowres_sample_noise_level, self.lowres_sample_noise_level) num_unets = len(self.unets) # condition scaling cond_scale = cast_tuple(cond_scale, num_unets) # for initial image and skipping steps init_images = cast_tuple(init_images, num_unets) init_images = [ maybe(self.normalize_img)(init_image) for init_image in init_images ] skip_steps = cast_tuple(skip_steps, num_unets) # handle starting at a unet greater than 1, for training only-upscaler training if start_at_unet_number > 1: assert not exists(stop_at_unet_number ) or start_at_unet_number <= stop_at_unet_number assert exists( start_image_or_video ), 'starting image or video must be supplied if only doing upscaling' prev_image_size = self.image_sizes[start_at_unet_number - 1] img = resize_image_to(start_image_or_video, prev_image_size) # go through each unet in cascade for unet_number, unet, channel, image_size, noise_scheduler, pred_objective, dynamic_threshold, unet_cond_scale, unet_init_images, unet_skip_steps in tqdm( zip( range(1, num_unets + 1), self.unets, self.sample_channels, self.image_sizes, self.noise_schedulers, self.pred_objectives, self.dynamic_thresholding, cond_scale, init_images, skip_steps)): lowres_cond_img = lowres_noise_times = None shape = (batch_size, channel, image_size, image_size) if unet.lowres_cond: lowres_noise_times = self.lowres_noise_schedule.get_times( batch_size, lowres_sample_noise_level) lowres_cond_img = resize_image_to(img, image_size) lowres_cond_img = self.normalize_img(lowres_cond_img) lowres_cond_img, *_ = self.lowres_noise_schedule.q_sample( x_start=lowres_cond_img, t=lowres_noise_times, noise=paddle.randn( shape=lowres_cond_img.shape, dtype=lowres_cond_img.dtype)) if exists(unet_init_images): unet_init_images = resize_image_to(unet_init_images, image_size) shape = (batch_size, self.channels, image_size, image_size) img = self.p_sample_loop( unet, shape, text_embeds=text_embeds, text_mask=text_masks, cond_images=cond_images, inpaint_images=inpaint_images, inpaint_masks=inpaint_masks, inpaint_resample_times=inpaint_resample_times, init_images=unet_init_images, skip_steps=unet_skip_steps, cond_scale=unet_cond_scale, lowres_cond_img=lowres_cond_img, lowres_noise_times=lowres_noise_times, noise_scheduler=noise_scheduler, pred_objective=pred_objective, dynamic_threshold=dynamic_threshold) outputs.append(img) if exists(stop_at_unet_number ) and stop_at_unet_number == unet_number: break output_index = -1 if not return_all_unet_outputs else slice( None) # either return last unet output or all unet outputs if not return_pil_images: return outputs[output_index] if not return_all_unet_outputs: outputs = outputs[-1:] pil_images = list( map(lambda img: list(map(T.ToPILImage(), img.unbind(dim=0))), outputs)) return pil_images[ output_index] # now you have a bunch of pillow images you can just .save(/where/ever/you/want.png) def p_losses(self, unet, x_start, times, *, noise_scheduler, lowres_cond_img=None, lowres_aug_times=None, text_embeds=None, text_mask=None, cond_images=None, noise=None, times_next=None, pred_objective='noise', p2_loss_weight_gamma=0., random_crop_size=None): is_video = x_start.ndim == 5 noise = default(noise, lambda: paddle.randn(shape=x_start.shape, dtype=x_start.dtype)) # normalize to [-1, 1] x_start = self.normalize_img(x_start) lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img) # random cropping during training # for upsamplers if exists(random_crop_size): if is_video: frames = x_start.shape[2] x_start, lowres_cond_img, noise = 
rearrange_many( (x_start, lowres_cond_img, noise), 'b c f h w -> (b f) c h w') aug = K.RandomCrop((random_crop_size, random_crop_size), p=1.) # make sure low res conditioner and image both get augmented the same way # detailed https://kornia.readthedocs.io/en/latest/augmentation.module.html?highlight=randomcrop#kornia.augmentation.RandomCrop x_start = aug(x_start) lowres_cond_img = aug(lowres_cond_img, params=aug._params) noise = aug(noise, params=aug._params) if is_video: x_start, lowres_cond_img, noise = rearrange_many( (x_start, lowres_cond_img, noise), '(b f) c h w -> b c f h w', f=frames) # get x_t x_noisy, log_snr, alpha, sigma = noise_scheduler.q_sample( x_start=x_start, t=times, noise=noise) # also noise the lowres conditioning image # at sample time, they then fix the noise level of 0.1 - 0.3 lowres_cond_img_noisy = None if exists(lowres_cond_img): lowres_aug_times = default(lowres_aug_times, times) lowres_cond_img_noisy, *_ = self.lowres_noise_schedule.q_sample( x_start=lowres_cond_img, t=lowres_aug_times, noise=paddle.randn( shape=lowres_cond_img.shape, dtype=lowres_cond_img.dtype)) # time condition noise_cond = noise_scheduler.get_condition(times) # unet kwargs unet_kwargs = dict( text_embeds=text_embeds, text_mask=text_mask, cond_images=cond_images, lowres_noise_times=self.lowres_noise_schedule.get_condition( lowres_aug_times), lowres_cond_img=lowres_cond_img_noisy, cond_drop_prob=self.cond_drop_prob, ) # self condition if needed # Because 'unet' can be an instance of DistributedDataParallel coming from the # ImagenTrainer.unet_being_trained when invoking ImagenTrainer.forward(), we need to # access the member 'module' of the wrapped unet instance. self_cond = unet._layers.self_cond if isinstance( unet, paddle.DataParallel) else unet.self_cond if self_cond and random() < 0.5: with paddle.no_grad(): pred = unet.forward(x_noisy, noise_cond, **unet_kwargs).detach() x_start = noise_scheduler.predict_start_from_noise( x_noisy, t=times, noise=pred) if pred_objective == 'noise' else pred unet_kwargs = { ** unet_kwargs, 'self_cond': x_start} # get prediction pred = unet.forward(x_noisy, noise_cond, **unet_kwargs) # prediction objective if pred_objective == 'noise': target = noise elif pred_objective == 'x_start': target = x_start elif pred_objective == 'v': # derivation detailed in Appendix D of Progressive Distillation paper # https://arxiv.org/abs/2202.00512 # this makes distillation viable as well as solve an issue with color shifting in upresoluting unets, noted in imagen-video target = alpha * noise - sigma * x_start else: raise ValueError(f'unknown objective {pred_objective}') return pred, target, log_snr, p2_loss_weight_gamma def forward(self, images, unet=None, texts=None, text_embeds=None, text_masks=None, unet_number=None, cond_images=None): if self.is_video and images.ndim == 4: images = rearrange(images, 'b c h w -> b c 1 h w') assert images.shape[-1] == images.shape[ -2], f'the images you pass in must be a square, but received dimensions of {images.shape[2]}, {images.shape[-1]}' assert not ( len(self.unets) > 1 and not exists(unet_number) ), f'you must specify which unet you want trained, from a range of 1 to {len(self.unets)}, if you are training cascading DDPM (multiple unets)' unet_number = default(unet_number, 1) assert not exists( self.only_train_unet_number ) or self.only_train_unet_number == unet_number, 'you can only train on unet #{self.only_train_unet_number}' images = cast_uint8_images_to_float(images) cond_images = 
maybe(cast_uint8_images_to_float)(cond_images) assert is_float_dtype( images.dtype ), f'images tensor needs to be floats but {images.dtype} dtype found instead' unet_index = unet_number - 1 unet = default(unet, lambda: self.get_unet(unet_number)) noise_scheduler = self.noise_schedulers[unet_index] p2_loss_weight_gamma = self.p2_loss_weight_gamma[unet_index] pred_objective = self.pred_objectives[unet_index] target_image_size = self.image_sizes[unet_index] random_crop_size = self.random_crop_sizes[unet_index] if self.is_sr: prev_image_size = self.image_sizes[unet_index - 1] else: prev_image_size = None b, c, h, w = images.shape assert images.shape[1] == self.channels assert h >= target_image_size and w >= target_image_size times = noise_scheduler.sample_random_times(b) if exists(texts) and not exists( text_embeds) and not self.unconditional: assert len(texts) == len( images ), 'number of text captions does not match up with the number of images given' with paddle.amp.auto_cast(enable=False): if 't5' in self.text_encoder_name: text_embeds, text_masks = self.t5_encode_text( t5=self.t5_encoder, texts=texts, tokenizer=self.tokenizer, return_attn_mask=True) elif 'deberta' in self.text_encoder_name: text_embeds, text_masks = self.debertav2_encode_text( debertav2=self.debertav2_encoder, texts=texts, tokenizer=self.tokenizer, return_attn_mask=True) else: raise NotImplementedError( "Please implement the text encoder.") if not self.unconditional: text_masks = default( text_masks, lambda: paddle.any(text_embeds != 0., axis=-1)) assert not ( self.condition_on_text and not exists(text_embeds) ), 'text or text encodings must be passed into decoder if specified' assert not ( not self.condition_on_text and exists(text_embeds) ), 'decoder specified not to be conditioned on text, yet it is presented' assert not ( exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim ), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})' lowres_cond_img = lowres_aug_times = None if exists(prev_image_size): lowres_cond_img = resize_image_to(images, prev_image_size) lowres_cond_img = resize_image_to(lowres_cond_img, target_image_size) if self.per_sample_random_aug_noise_level: lowres_aug_times = self.lowres_noise_schedule.sample_random_times( b) else: lowres_aug_time = self.lowres_noise_schedule.sample_random_times( 1) lowres_aug_times = repeat(lowres_aug_time, '1 -> b', b=b) images = resize_image_to(images, target_image_size) return self.p_losses( unet, images, times, text_embeds=text_embeds, text_mask=text_masks, cond_images=cond_images, noise_scheduler=noise_scheduler, lowres_cond_img=lowres_cond_img, lowres_aug_times=lowres_aug_times, pred_objective=pred_objective, p2_loss_weight_gamma=p2_loss_weight_gamma, random_crop_size=random_crop_size) def imagen_397M_text2im_64(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') model = ImagenModel( unets=Unet64_397M(use_recompute=use_recompute), image_sizes=(64, ), **kwargs) return model def imagen_text2im_64(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') if 'lowres_cond' in kwargs: lowres_cond = kwargs.pop('lowres_cond') else: lowres_cond = False model = ImagenModel( unets=BaseUnet64( lowres_cond=lowres_cond, use_recompute=use_recompute), image_sizes=(64, ), **kwargs) return model def imagen_text2im_64_debertav2(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = 
kwargs.pop('recompute_granularity') model = ImagenModel( unets=BaseUnet64( dim=360, use_recompute=use_recompute), image_sizes=(64, ), **kwargs) return model def imagen_text2im_64_SR256(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') model = ImagenModel( unets=(BaseUnet64(use_recompute=use_recompute), SRUnet256(use_recompute=use_recompute)), image_sizes=(64, 256), **kwargs) return model def imagen_SR256(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') if 'lowres_cond' in kwargs: lowres_cond = kwargs.pop('lowres_cond') else: lowres_cond = False model = ImagenModel( unets=SRUnet256( lowres_cond=lowres_cond, use_recompute=use_recompute), image_sizes=(256, 64), **kwargs) return model def imagen_SR1024(**kwargs): use_recompute = kwargs.pop('use_recompute') recompute_granularity = kwargs.pop('recompute_granularity') if 'lowres_cond' in kwargs: lowres_cond = kwargs.pop('lowres_cond') else: lowres_cond = False model = ImagenModel( unets=SRUnet1024( dim=128, lowres_cond=lowres_cond, use_recompute=use_recompute), image_sizes=(1024, 256), **kwargs) return model ================================================ FILE: ppfleetx/models/multimodal_model/imagen/unet.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
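(An illustrative note on the classifier-free guidance used by the Unet defined in this file: during training, text conditioning is dropped with probability cond_drop_prob, and at sampling time forward_with_cond_scale blends the conditional and unconditional predictions. The sketch below uses toy random tensors and a hypothetical helper name purely to show the blending arithmetic.)

import paddle

def guidance_blend(cond_pred, uncond_pred, cond_scale=3.0):
    # mirrors the blend in Unet.forward_with_cond_scale:
    # cond_scale > 1 pushes the output further along the text-conditioned direction
    return uncond_pred + (cond_pred - uncond_pred) * cond_scale

# toy stand-ins for the conditional and unconditional Unet outputs
cond = paddle.randn([1, 3, 64, 64])
uncond = paddle.randn([1, 3, 64, 64])
guided = guidance_blend(cond, uncond, cond_scale=3.0)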
import math from pathlib import Path from functools import partial import paddle from paddle import nn from paddle import nn, einsum import paddle.nn.functional as F from paddle.distributed.fleet.utils import recompute from .utils import (zeros_, zero_init_, default, exists, cast_tuple, l2norm, resize_image_to, prob_mask_like, masked_mean, Identity, repeat, repeat_many, Rearrange, rearrange, rearrange_many, EinopsToAndFrom, Parallel, Always, print_once) from ppfleetx.models.language_model.t5.modeling import finfo class LayerNorm(nn.Layer): def __init__(self, feats, stable=False, dim=-1): super().__init__() self.stable = stable self.dim = dim self.g = self.create_parameter( [feats, *((1, ) * (-dim - 1))], default_initializer=nn.initializer.Constant(value=1.)) def forward(self, x): dtype, dim = x.dtype, self.dim if self.stable: x = x / x.amax(axis=dim, keepdim=True).detach() eps = 1e-5 if x.dtype == paddle.float32 else 1e-3 var = paddle.var(x, axis=dim, unbiased=False, keepdim=True) mean = paddle.mean(x, axis=dim, keepdim=True) return (x - mean) * ( var + eps).rsqrt().cast(dtype) * self.g.cast(dtype) ChanLayerNorm = partial(LayerNorm, dim=-3) class Residual(nn.Layer): def __init__(self, fn): super().__init__() self.fn = fn def forward(self, x, **kwargs): return self.fn(x, **kwargs) + x # attention pooling class PerceiverAttention(nn.Layer): def __init__(self, *, dim, dim_head=64, heads=8, cosine_sim_attn=False): super().__init__() self.scale = dim_head**-0.5 if not cosine_sim_attn else 1 self.cosine_sim_attn = cosine_sim_attn self.cosine_sim_scale = 16 if cosine_sim_attn else 1 self.heads = heads inner_dim = dim_head * heads self.norm = nn.LayerNorm(dim) self.norm_latents = nn.LayerNorm(dim) self.to_q = nn.Linear(dim, inner_dim, bias_attr=False) self.to_kv = nn.Linear(dim, inner_dim * 2, bias_attr=False) self.to_out = nn.Sequential( nn.Linear( inner_dim, dim, bias_attr=False), nn.LayerNorm(dim)) def forward(self, x, latents, mask=None): x = self.norm(x) latents = self.norm_latents(latents) b, h = x.shape[0], self.heads q = self.to_q(latents) # the paper differs from Perceiver in which they also concat the key / values derived from the latents to be attended to kv_input = paddle.concat((x, latents), axis=-2) k, v = self.to_kv(kv_input).chunk(2, axis=-1) q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h=h) q = q * self.scale # cosine sim attention if self.cosine_sim_attn: q, k = map(l2norm, (q, k)) # similarities and masking sim = einsum('... i d, ... j d -> ... i j', q, k) * self.cosine_sim_scale if exists(mask): max_neg_value = -finfo(sim.dtype).max mask = F.pad(mask, (0, latents.shape[-2]), value=True) mask = rearrange(mask, 'b j -> b 1 1 j') sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim) # attention attn = F.softmax(sim, axis=-1, dtype=paddle.float32) attn = attn.cast(sim.dtype) out = einsum('... i j, ... j d -> ... 
i d', attn, v) B, H, N, D = out.shape out = out.transpose([0, 2, 1, 3]).reshape([B, N, -1]) return self.to_out(out) class PerceiverResampler(nn.Layer): def __init__( self, *, dim, depth, dim_head=64, heads=8, num_latents=64, num_latents_mean_pooled=4, # number of latents derived from mean pooled representation of the sequence max_seq_len=512, ff_mult=4, cosine_sim_attn=False): super().__init__() self.pos_emb = nn.Embedding(max_seq_len, dim) self.latents = self.create_parameter( [num_latents, dim], default_initializer=nn.initializer.Normal()) self.to_latents_from_mean_pooled_seq = None if num_latents_mean_pooled > 0: self.to_latents_from_mean_pooled_seq = nn.Sequential( LayerNorm(dim), nn.Linear(dim, dim * num_latents_mean_pooled), Rearrange( 'b (n d) -> b n d', n=num_latents_mean_pooled)) self.layers = nn.LayerList([]) for _ in range(depth): self.layers.append( nn.LayerList([ PerceiverAttention( dim=dim, dim_head=dim_head, heads=heads, cosine_sim_attn=cosine_sim_attn), FeedForward( dim=dim, mult=ff_mult) ])) def forward(self, x, mask=None): n = x.shape[1] pos_emb = self.pos_emb(paddle.arange(n)) x_with_pos = x + pos_emb latents = repeat(self.latents, 'n d -> b n d', b=x.shape[0]) if exists(self.to_latents_from_mean_pooled_seq): meanpooled_seq = masked_mean( x, axis=1, mask=paddle.ones( x.shape[:2], dtype=paddle.bool)) meanpooled_latents = self.to_latents_from_mean_pooled_seq( meanpooled_seq) latents = paddle.concat((meanpooled_latents, latents), axis=-2) for attn, ff in self.layers: latents = attn(x_with_pos, latents, mask=mask) + latents latents = ff(latents) + latents return latents # attention class Attention(nn.Layer): def __init__( self, dim, *, dim_head=64, heads=8, context_dim=None, cosine_sim_attn=False, use_recompute=False, ): super().__init__() self.use_recompute = use_recompute self.scale = dim_head**-0.5 if not cosine_sim_attn else 1. 
self.cosine_sim_attn = cosine_sim_attn self.cosine_sim_scale = 16 if cosine_sim_attn else 1 self.heads = heads inner_dim = dim_head * heads self.norm = LayerNorm(dim) self.null_kv = self.create_parameter( [2, dim_head], default_initializer=nn.initializer.Normal()) self.to_q = nn.Linear(dim, inner_dim, bias_attr=False) self.to_kv = nn.Linear(dim, dim_head * 2, bias_attr=False) self.to_context = nn.Sequential( nn.LayerNorm(context_dim), nn.Linear( context_dim, dim_head * 2)) if exists(context_dim) else None self.to_out = nn.Sequential( nn.Linear( inner_dim, dim, bias_attr=False), LayerNorm(dim)) def forward(self, x, context=None, mask=None, attn_bias=None): if self.use_recompute: return recompute(self._forward, x, context, mask, attn_bias) else: return self._forward(x, context, mask, attn_bias) def _forward(self, x, context=None, mask=None, attn_bias=None): b, n = x.shape[:2] x = self.norm(x) q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, axis=-1)) q = rearrange(q, 'b n (h d) -> b h n d', h=self.heads) q = q * self.scale # add null key / value for classifier free guidance in prior net nk, nv = repeat_many(self.null_kv.unbind(axis=-2), 'd -> b 1 d', b=b) k = paddle.concat((nk, k), axis=-2) v = paddle.concat((nv, v), axis=-2) # add text conditioning, if present if exists(context): assert exists(self.to_context) ck, cv = self.to_context(context).chunk(2, axis=-1) k = paddle.concat((ck, k), axis=-2) v = paddle.concat((cv, v), axis=-2) # cosine sim attention if self.cosine_sim_attn: q, k = map(l2norm, (q, k)) # calculate query / key similarities sim = einsum('b h i d, b j d -> b h i j', q, k) * self.cosine_sim_scale # relative positional encoding (T5 style) if exists(attn_bias): sim = sim + attn_bias # masking max_neg_value = -finfo(sim.dtype).max if exists(mask): mask = F.pad(mask, (1, 0), value=True) mask = rearrange(mask, 'b j -> b 1 1 j') sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim) # attention attn = F.softmax(sim, axis=-1, dtype=paddle.float32) # aggregate values out = einsum('b h i j, b j d -> b h i d', attn, v) out = rearrange(out, 'b h n d -> b n (h d)') return self.to_out(out) # decoder def Upsample(dim, dim_out=None): dim_out = default(dim_out, dim) return nn.Sequential( nn.Upsample( scale_factor=2, mode='nearest'), nn.Conv2D( dim, dim_out, 3, padding=1)) class PixelShuffleUpsample(nn.Layer): """ code shared by @MalumaDev at DALLE2 for addressing checkboard artifacts https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf """ def __init__(self, dim, dim_out=None): super().__init__() dim_out = default(dim_out, dim) conv = nn.Conv2D(dim, dim_out * 4, 1) self.net = nn.Sequential(conv, nn.Silu(), nn.PixelShuffle(2)) self.init_conv_(conv) def init_conv_(self, conv): o, i, h, w = conv.weight.shape conv_weight = paddle.empty([o // 4, i, h, w]) nn.initializer.KaimingUniform(conv_weight) conv_weight = repeat(conv_weight, 'o ... 
-> (o 4) ...') conv.weight.set_value(conv_weight) zeros_(conv.bias) def forward(self, x): return self.net(x) def Downsample(dim, dim_out=None): dim_out = default(dim_out, dim) return nn.Sequential( Rearrange( 'b c (h s1) (w s2) -> b (c s1 s2) h w', s1=2, s2=2), nn.Conv2D(dim * 4, dim_out, 1)) class SinusoidalPosEmb(nn.Layer): def __init__(self, dim): super().__init__() self.dim = dim def forward(self, x): half_dim = self.dim // 2 emb = math.log(10000) / (half_dim - 1) emb = paddle.exp(paddle.arange(half_dim) * -emb) emb = x[:, None] * emb[None, :] return paddle.concat((emb.sin(), emb.cos()), axis=-1) class LearnedSinusoidalPosEmb(nn.Layer): """ following @crowsonkb 's lead with learned sinusoidal pos emb """ """ https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8 """ def __init__(self, dim): super().__init__() assert (dim % 2) == 0 half_dim = dim // 2 self.weights = self.create_parameter( [half_dim], default_initializer=nn.initializer.Normal()) def forward(self, x): x = x[:, None] freqs = x * self.weights[None, :] * 2 * math.pi fouriered = paddle.concat((freqs.sin(), freqs.cos()), axis=-1) fouriered = paddle.concat((x, fouriered), axis=-1) return fouriered class Block(nn.Layer): def __init__(self, dim, dim_out, groups=8, norm=True): super().__init__() self.groupnorm = nn.GroupNorm(groups, dim) if norm else Identity() self.activation = nn.Silu() self.project = nn.Conv2D(dim, dim_out, 3, padding=1) def forward(self, x, scale_shift=None): x = self.groupnorm(x) if exists(scale_shift): scale, shift = scale_shift x = x * (scale + 1) + shift x = self.activation(x) return self.project(x) class ResnetBlock(nn.Layer): def __init__(self, dim, dim_out, *, cond_dim=None, time_cond_dim=None, groups=8, linear_attn=False, use_gca=False, squeeze_excite=False, use_recompute=False, **attn_kwargs): super().__init__() self.time_mlp = None self.use_recompute = use_recompute if exists(time_cond_dim): self.time_mlp = nn.Sequential( nn.Silu(), nn.Linear(time_cond_dim, dim_out * 2)) self.cross_attn = None if exists(cond_dim): attn_klass = CrossAttention if not linear_attn else LinearCrossAttention self.cross_attn = attn_klass( dim=dim_out, context_dim=cond_dim, **attn_kwargs) self.block1 = Block(dim, dim_out, groups=groups) self.block2 = Block(dim_out, dim_out, groups=groups) self.gca = GlobalContext( dim_in=dim_out, dim_out=dim_out) if use_gca else Always(1) self.res_conv = nn.Conv2D(dim, dim_out, 1) if dim != dim_out else Identity() def forward(self, x, time_emb=None, cond=None): scale_shift = None if exists(self.time_mlp) and exists(time_emb): time_emb = self.time_mlp(time_emb) time_emb = time_emb[:, :, None, None] scale_shift = time_emb.chunk(2, axis=1) h = self.block1(x) if exists(self.cross_attn): assert exists(cond) h = h.transpose([0, 2, 3, 1]) n, b, c, *_ = h.shape h = h.reshape([n, b * c, -1]) h = self.cross_attn(h, context=cond) + h h = h.reshape([n, b, c, -1]) h = h.transpose([0, 3, 1, 2]) h = self.block2(h, scale_shift=scale_shift) h = h * self.gca(h) return h + self.res_conv(x) class CrossAttention(nn.Layer): def __init__(self, dim, *, context_dim=None, dim_head=64, heads=8, norm_context=False, cosine_sim_attn=False): super().__init__() self.scale = dim_head**-0.5 if not cosine_sim_attn else 1. 
self.cosine_sim_attn = cosine_sim_attn self.cosine_sim_scale = 16 if cosine_sim_attn else 1 self.heads = heads inner_dim = dim_head * heads context_dim = default(context_dim, dim) self.norm = LayerNorm(dim) self.norm_context = LayerNorm( context_dim) if norm_context else Identity() self.null_kv = self.create_parameter( [2, dim_head], default_initializer=nn.initializer.Normal()) self.to_q = nn.Linear(dim, inner_dim, bias_attr=False) self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias_attr=False) self.to_out = nn.Sequential( nn.Linear( inner_dim, dim, bias_attr=False), LayerNorm(dim)) def forward(self, x, context, mask=None): b, n = x.shape[:2] x = self.norm(x) context = self.norm_context(context) q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, axis=-1)) q, k, v = rearrange_many( (q, k, v), 'b n (h d) -> b h n d', h=self.heads) # add null key / value for classifier free guidance in prior net nk, nv = repeat_many( self.null_kv.unbind(axis=-2), 'd -> b h 1 d', h=self.heads, b=b) k = paddle.concat((nk, k), axis=-2) v = paddle.concat((nv, v), axis=-2) q = q * self.scale # cosine sim attention if self.cosine_sim_attn: q, k = map(l2norm, (q, k)) # similarities sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.cosine_sim_scale # masking max_neg_value = -finfo(sim.dtype).max if exists(mask): mask = F.pad(mask, (1, 0), value=True) mask = rearrange(mask, 'b j -> b 1 1 j') sim = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), sim) attn = F.softmax(sim, axis=-1, dtype=paddle.float32) attn = attn.cast(sim.dtype) out = einsum('b h i j, b h j d -> b h i d', attn, v) out = rearrange(out, 'b h n d -> b n (h d)') return self.to_out(out) class LinearCrossAttention(CrossAttention): def forward(self, x, context, mask=None): b, n = x.shape[:2] x = self.norm(x) context = self.norm_context(context) q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, axis=-1)) q, k, v = rearrange_many( (q, k, v), 'b n (h d) -> (b h) n d', h=self.heads) # add null key / value for classifier free guidance in prior net nk, nv = repeat_many( self.null_kv.unbind(axis=-2), 'd -> (b h) 1 d', h=self.heads, b=b) k = paddle.concat((nk, k), axis=-2) v = paddle.concat((nv, v), axis=-2) # masking max_neg_value = -finfo(x.dtype).max if exists(mask): mask = F.pad(mask, (1, 0), value=True) mask = rearrange(mask, 'b n -> b n 1') k = paddle.where(mask == 0, paddle.to_tensor(max_neg_value), k) v = paddle.where(mask == 0, paddle.to_tensor(0.), v) # linear attention q = F.softmax(q, axis=-1) k = F.softmax(k, axis=-2) q = q * self.scale context = einsum('b n d, b n e -> b d e', k, v) out = einsum('b n d, b d e -> b n e', q, context) out = rearrange(out, '(b h) n d -> b n (h d)', h=self.heads) return self.to_out(out) class LinearAttention(nn.Layer): def __init__(self, dim, dim_head=32, heads=8, dropout=0.05, context_dim=None, **kwargs): super().__init__() self.scale = dim_head**-0.5 self.heads = heads inner_dim = dim_head * heads self.norm = ChanLayerNorm(dim) self.nonlin = nn.Silu() self.to_q = nn.Sequential( nn.Dropout(dropout), nn.Conv2D( dim, inner_dim, 1, bias_attr=False), nn.Conv2D( inner_dim, inner_dim, 3, bias_attr=False, padding=1, groups=inner_dim)) self.to_k = nn.Sequential( nn.Dropout(dropout), nn.Conv2D( dim, inner_dim, 1, bias_attr=False), nn.Conv2D( inner_dim, inner_dim, 3, bias_attr=False, padding=1, groups=inner_dim)) self.to_v = nn.Sequential( nn.Dropout(dropout), nn.Conv2D( dim, inner_dim, 1, bias_attr=False), nn.Conv2D( inner_dim, inner_dim, 3, bias_attr=False, padding=1, groups=inner_dim)) self.to_context 
= nn.Sequential( nn.LayerNorm(context_dim), nn.Linear( context_dim, inner_dim * 2, bias_attr=False)) if exists(context_dim) else None self.to_out = nn.Sequential( nn.Conv2D( inner_dim, dim, 1, bias_attr=False), ChanLayerNorm(dim)) def forward(self, fmap, context=None): h, x, y = self.heads, *fmap.shape[-2:] fmap = self.norm(fmap) q, k, v = map(lambda fn: fn(fmap), (self.to_q, self.to_k, self.to_v)) q, k, v = rearrange_many( (q, k, v), 'b (h c) x y -> (b h) (x y) c', h=h) if exists(context): assert exists(self.to_context) ck, cv = self.to_context(context).chunk(2, axis=-1) ck, cv = rearrange_many((ck, cv), 'b n (h d) -> (b h) n d', h=h) k = paddle.concat((k, ck), axis=-2) v = paddle.concat((v, cv), axis=-2) q = F.softmax(q, axis=-1) k = F.softmax(k, axis=-2) q = q * self.scale context = einsum('b n d, b n e -> b d e', k, v) out = einsum('b n d, b d e -> b n e', q, context) out = rearrange(out, '(b h) (x y) d -> b (h d) x y', h=h, x=x, y=y) out = self.nonlin(out) return self.to_out(out) class GlobalContext(nn.Layer): """ basically a superior form of squeeze-excitation that is attention-esque """ def __init__(self, *, dim_in, dim_out): super().__init__() self.to_k = nn.Conv2D(dim_in, 1, 1) hidden_dim = max(3, dim_out // 2) self.net = nn.Sequential( nn.Conv2D(dim_in, hidden_dim, 1), nn.Silu(), nn.Conv2D(hidden_dim, dim_out, 1), nn.Sigmoid()) def forward(self, x): context = self.to_k(x) x, context = rearrange_many((x, context), 'b n ... -> b n (...)') out = einsum('b i n, b c n -> b c i', F.softmax(context, axis=-1), x) out = out[:, :, :, None] return self.net(out) def FeedForward(dim, mult=2): hidden_dim = int(dim * mult) return nn.Sequential( LayerNorm(dim), nn.Linear( dim, hidden_dim, bias_attr=False), nn.GELU(), LayerNorm(hidden_dim), nn.Linear( hidden_dim, dim, bias_attr=False)) def ChanFeedForward( dim, mult=2 ): # in paper, it seems for self attention layers they did feedforwards with twice channel width hidden_dim = int(dim * mult) return nn.Sequential( ChanLayerNorm(dim), nn.Conv2D( dim, hidden_dim, 1, bias_attr=False), nn.GELU(), ChanLayerNorm(hidden_dim), nn.Conv2D( hidden_dim, dim, 1, bias_attr=False)) class TransformerBlock(nn.Layer): def __init__( self, dim, *, depth=1, heads=8, dim_head=32, ff_mult=2, context_dim=None, cosine_sim_attn=False, use_recompute=False, ): super().__init__() self.layers = nn.LayerList([]) for _ in range(depth): self.layers.append( nn.LayerList([ Attention( dim=dim, heads=heads, dim_head=dim_head, context_dim=context_dim, cosine_sim_attn=cosine_sim_attn, use_recompute=use_recompute), FeedForward( dim=dim, mult=ff_mult) ])) def forward(self, x, context=None): x = x.transpose([0, 2, 3, 1]) n, b, c, *_ = x.shape x = x.reshape([n, b * c, -1]) for attn, ff in self.layers: x = attn(x, context=context) + x x = ff(x) + x x = x.reshape([n, b, c, -1]) x = x.transpose([0, 3, 1, 2]) return x class LinearAttentionTransformerBlock(nn.Layer): def __init__(self, dim, *, depth=1, heads=8, dim_head=32, ff_mult=2, context_dim=None, **kwargs): super().__init__() self.layers = nn.LayerList([]) for _ in range(depth): self.layers.append( nn.LayerList([ LinearAttention( dim=dim, heads=heads, dim_head=dim_head, context_dim=context_dim), ChanFeedForward( dim=dim, mult=ff_mult) ])) def forward(self, x, context=None): for attn, ff in self.layers: x = attn(x, context=context) + x x = ff(x) + x return x class CrossEmbedLayer(nn.Layer): def __init__(self, dim_in, kernel_sizes, dim_out=None, stride=2): super().__init__() assert all([*map(lambda t: (t % 2) == (stride % 2), 
kernel_sizes)]) dim_out = default(dim_out, dim_in) kernel_sizes = sorted(kernel_sizes) num_scales = len(kernel_sizes) # calculate the dimension at each scale dim_scales = [int(dim_out / (2**i)) for i in range(1, num_scales)] dim_scales = [*dim_scales, dim_out - sum(dim_scales)] self.convs = nn.LayerList([]) for kernel, dim_scale in zip(kernel_sizes, dim_scales): self.convs.append( nn.Conv2D( dim_in, dim_scale, kernel, stride=stride, padding=(kernel - stride) // 2)) def forward(self, x): fmaps = tuple(map(lambda conv: conv(x), self.convs)) return paddle.concat(fmaps, axis=1) class UpsampleCombiner(nn.Layer): def __init__(self, dim, *, enabled=False, dim_ins=tuple(), dim_outs=tuple()): super().__init__() dim_outs = cast_tuple(dim_outs, len(dim_ins)) assert len(dim_ins) == len(dim_outs) self.enabled = enabled if not self.enabled: self.dim_out = dim return self.fmap_convs = nn.LayerList([ Block(dim_in, dim_out) for dim_in, dim_out in zip(dim_ins, dim_outs) ]) self.dim_out = dim + (sum(dim_outs) if len(dim_outs) > 0 else 0) def forward(self, x, fmaps=None): target_size = x.shape[-1] fmaps = default(fmaps, tuple()) if not self.enabled or len(fmaps) == 0 or len(self.fmap_convs) == 0: return x fmaps = [resize_image_to(fmap, target_size) for fmap in fmaps] outs = [conv(fmap) for fmap, conv in zip(fmaps, self.fmap_convs)] return paddle.concat((x, *outs), axis=1) class Unet(nn.Layer): def __init__(self, *, dim, image_embed_dim=1024, text_embed_dim=1024, num_resnet_blocks=1, cond_dim=None, num_image_tokens=4, num_time_tokens=2, learned_sinu_pos_emb_dim=16, out_dim=None, dim_mults=(1, 2, 4, 8), cond_images_channels=0, channels=3, channels_out=None, attn_dim_head=64, attn_heads=8, ff_mult=2., lowres_cond=False, layer_attns=True, layer_attns_depth=1, layer_mid_attns_depth=1, layer_attns_add_text_cond=True, attend_at_middle=True, layer_cross_attns=True, use_linear_attn=False, use_linear_cross_attn=False, cond_on_text=True, max_text_len=256, init_dim=None, resnet_groups=8, init_conv_kernel_size=7, init_cross_embed=True, init_cross_embed_kernel_sizes=(3, 7, 15), cross_embed_downsample=False, cross_embed_downsample_kernel_sizes=(2, 4), attn_pool_text=True, attn_pool_num_latents=32, dropout=0., memory_efficient=False, init_conv_to_final_conv_residual=False, use_global_context_attn=True, scale_skip_connection=True, final_resnet_block=True, final_conv_kernel_size=3, cosine_sim_attn=False, self_cond=False, combine_upsample_fmaps=False, pixel_shuffle_upsample=True, use_recompute=False): super().__init__() self.use_recompute = use_recompute # guide researchers assert attn_heads > 1, 'you need to have more than 1 attention head, ideally at least 4 or 8' if dim < 128: print_once( 'The base dimension of your u-net should ideally be no smaller than 128, as recommended by a professional DDPM trainer https://nonint.com/2022/05/04/friends-dont-let-friends-train-small-diffusion-models/' ) # save locals to take care of some hyperparameters for cascading DDPM self._locals = locals() self._locals.pop('self', None) self._locals.pop('__class__', None) # determine dimensions self.channels = channels self.channels_out = default(channels_out, channels) init_channels = channels * (1 + int(lowres_cond) + int(self_cond)) init_dim = default(init_dim, dim) self.self_cond = self_cond # optional image conditioning self.has_cond_image = cond_images_channels > 0 self.cond_images_channels = cond_images_channels init_channels += cond_images_channels # initial convolution self.init_conv = CrossEmbedLayer( init_channels, dim_out=init_dim, 
kernel_sizes=init_cross_embed_kernel_sizes, stride=1) if init_cross_embed else nn.Conv2D( init_channels, init_dim, init_conv_kernel_size, padding=init_conv_kernel_size // 2) dims = [init_dim, *map(lambda m: dim * m, dim_mults)] in_out = list(zip(dims[:-1], dims[1:])) # time conditioning cond_dim = default(cond_dim, dim) time_cond_dim = dim * 4 * (2 if lowres_cond else 1) # embedding time for log(snr) noise from continuous version sinu_pos_emb = LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim) sinu_pos_emb_input_dim = learned_sinu_pos_emb_dim + 1 self.to_time_hiddens = nn.Sequential( sinu_pos_emb, nn.Linear(sinu_pos_emb_input_dim, time_cond_dim), nn.Silu()) self.to_time_cond = nn.Sequential( nn.Linear(time_cond_dim, time_cond_dim)) # project to time tokens as well as time hiddens self.to_time_tokens = nn.Sequential( nn.Linear(time_cond_dim, cond_dim * num_time_tokens), Rearrange( 'b (n d) -> b n d', n=num_time_tokens)) # low res aug noise conditioning self.lowres_cond = lowres_cond if lowres_cond: self.to_lowres_time_hiddens = nn.Sequential( LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim), nn.Linear(learned_sinu_pos_emb_dim + 1, time_cond_dim), nn.Silu()) self.to_lowres_time_cond = nn.Sequential( nn.Linear(time_cond_dim, time_cond_dim)) self.to_lowres_time_tokens = nn.Sequential( nn.Linear(time_cond_dim, cond_dim * num_time_tokens), Rearrange( 'b (n d) -> b n d', n=num_time_tokens)) # normalizations self.norm_cond = nn.LayerNorm(cond_dim) # text encoding conditioning (optional) self.text_to_cond = None if cond_on_text: assert exists( text_embed_dim ), 'text_embed_dim must be given to the unet if cond_on_text is True' self.text_to_cond = nn.Linear(text_embed_dim, cond_dim) # finer control over whether to condition on text encodings self.cond_on_text = cond_on_text # attention pooling self.attn_pool = PerceiverResampler( dim=cond_dim, depth=2, dim_head=attn_dim_head, heads=attn_heads, num_latents=attn_pool_num_latents, cosine_sim_attn=cosine_sim_attn) if attn_pool_text else None # for classifier free guidance self.max_text_len = max_text_len self.null_text_embed = self.create_parameter( [1, max_text_len, cond_dim], default_initializer=nn.initializer.Normal()) self.null_text_hidden = self.create_parameter( [1, time_cond_dim], default_initializer=nn.initializer.Normal()) # for non-attention based text conditioning at all points in the network where time is also conditioned self.to_text_non_attn_cond = None if cond_on_text: self.to_text_non_attn_cond = nn.Sequential( nn.LayerNorm(cond_dim), nn.Linear(cond_dim, time_cond_dim), nn.Silu(), nn.Linear(time_cond_dim, time_cond_dim)) # attention related params attn_kwargs = dict( heads=attn_heads, dim_head=attn_dim_head, cosine_sim_attn=cosine_sim_attn, use_recompute=use_recompute) num_layers = len(in_out) # resnet block klass num_resnet_blocks = cast_tuple(num_resnet_blocks, num_layers) resnet_groups = cast_tuple(resnet_groups, num_layers) resnet_klass = partial(ResnetBlock, **attn_kwargs) layer_attns = cast_tuple(layer_attns, num_layers) layer_attns_depth = cast_tuple(layer_attns_depth, num_layers) layer_cross_attns = cast_tuple(layer_cross_attns, num_layers) use_linear_attn = cast_tuple(use_linear_attn, num_layers) use_linear_cross_attn = cast_tuple(use_linear_cross_attn, num_layers) assert all([ layers == num_layers for layers in list( map(len, (resnet_groups, layer_attns, layer_cross_attns))) ]) # downsample klass downsample_klass = Downsample if cross_embed_downsample: downsample_klass = partial( CrossEmbedLayer, 
kernel_sizes=cross_embed_downsample_kernel_sizes) # initial resnet block (for memory efficient unet) self.init_resnet_block = resnet_klass( init_dim, init_dim, time_cond_dim=time_cond_dim, groups=resnet_groups[0], use_gca=use_global_context_attn) if memory_efficient else None # scale for resnet skip connections self.skip_connect_scale = 1. if not scale_skip_connection else (2 **-0.5) # layers self.downs = nn.LayerList([]) self.ups = nn.LayerList([]) num_resolutions = len(in_out) layer_params = [ num_resnet_blocks, resnet_groups, layer_attns, layer_attns_depth, layer_cross_attns, use_linear_attn, use_linear_cross_attn ] reversed_layer_params = list(map(reversed, layer_params)) # downsampling layers skip_connect_dims = [] # keep track of skip connection dimensions for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn ) in enumerate(zip(in_out, *layer_params)): is_last = ind >= (num_resolutions - 1) layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None if layer_attn: transformer_block_klass = TransformerBlock elif layer_use_linear_attn: transformer_block_klass = LinearAttentionTransformerBlock else: transformer_block_klass = Identity current_dim = dim_in # whether to pre-downsample, from memory efficient unet pre_downsample = None if memory_efficient: pre_downsample = downsample_klass(dim_in, dim_out) current_dim = dim_out skip_connect_dims.append(current_dim) # whether to do post-downsample, for non-memory efficient unet post_downsample = None if not memory_efficient: post_downsample = downsample_klass( current_dim, dim_out) if not is_last else Parallel( nn.Conv2D( dim_in, dim_out, 3, padding=1), nn.Conv2D(dim_in, dim_out, 1)) self.downs.append( nn.LayerList([ pre_downsample, resnet_klass( current_dim, current_dim, cond_dim=layer_cond_dim, linear_attn=layer_use_linear_cross_attn, time_cond_dim=time_cond_dim, groups=groups, use_recompute=use_recompute), nn.LayerList([ ResnetBlock( current_dim, current_dim, time_cond_dim=time_cond_dim, groups=groups, use_gca=use_global_context_attn, use_recompute=use_recompute) for _ in range(layer_num_resnet_blocks) ]), transformer_block_klass( dim=current_dim, depth=layer_attn_depth, ff_mult=ff_mult, context_dim=cond_dim, **attn_kwargs), post_downsample ])) # middle layers mid_dim = dims[-1] self.mid_block1 = ResnetBlock( mid_dim, mid_dim, cond_dim=cond_dim, time_cond_dim=time_cond_dim, groups=resnet_groups[-1], use_recompute=use_recompute) self.mid_attn = TransformerBlock( mid_dim, depth=layer_mid_attns_depth, **attn_kwargs) if attend_at_middle else None self.mid_block2 = ResnetBlock( mid_dim, mid_dim, cond_dim=cond_dim, time_cond_dim=time_cond_dim, groups=resnet_groups[-1], use_recompute=use_recompute) # upsample klass upsample_klass = Upsample if not pixel_shuffle_upsample else PixelShuffleUpsample # upsampling layers upsample_fmap_dims = [] for ind, ( (dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn ) in enumerate(zip(reversed(in_out), *reversed_layer_params)): is_last = ind == (len(in_out) - 1) layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None if layer_attn: transformer_block_klass = TransformerBlock elif layer_use_linear_attn: transformer_block_klass = LinearAttentionTransformerBlock else: transformer_block_klass = Identity skip_connect_dim = skip_connect_dims.pop() 
upsample_fmap_dims.append(dim_out) self.ups.append( nn.LayerList([ resnet_klass( dim_out + skip_connect_dim, dim_out, cond_dim=layer_cond_dim, linear_attn=layer_use_linear_cross_attn, time_cond_dim=time_cond_dim, groups=groups, use_recompute=use_recompute), nn.LayerList([ ResnetBlock( dim_out + skip_connect_dim, dim_out, time_cond_dim=time_cond_dim, groups=groups, use_gca=use_global_context_attn, use_recompute=use_recompute) for _ in range(layer_num_resnet_blocks) ]), transformer_block_klass( dim=dim_out, depth=layer_attn_depth, ff_mult=ff_mult, context_dim=cond_dim, **attn_kwargs), upsample_klass(dim_out, dim_in) if not is_last or memory_efficient else Identity() ])) # whether to combine feature maps from all upsample blocks before final resnet block out self.upsample_combiner = UpsampleCombiner( dim=dim, enabled=combine_upsample_fmaps, dim_ins=upsample_fmap_dims, dim_outs=dim) # whether to do a final residual from initial conv to the final resnet block out self.init_conv_to_final_conv_residual = init_conv_to_final_conv_residual final_conv_dim = self.upsample_combiner.dim_out + ( dim if init_conv_to_final_conv_residual else 0) # final optional resnet block and convolution out self.final_res_block = ResnetBlock( final_conv_dim, dim, time_cond_dim=time_cond_dim, groups=resnet_groups[0], use_gca=True, use_recompute=use_recompute) if final_resnet_block else None final_conv_dim_in = dim if final_resnet_block else final_conv_dim final_conv_dim_in += (channels if lowres_cond else 0) self.final_conv = nn.Conv2D( final_conv_dim_in, self.channels_out, final_conv_kernel_size, padding=final_conv_kernel_size // 2) zero_init_(self.final_conv) # if the current settings for the unet are not correct # for cascading DDPM, then reinit the unet with the right settings def cast_model_parameters(self, *, text_embed_dim, channels, channels_out, cond_on_text): if channels == self.channels and \ cond_on_text == self.cond_on_text and \ text_embed_dim == self._locals['text_embed_dim'] and \ channels_out == self.channels_out: return self updated_kwargs = dict( text_embed_dim=text_embed_dim, channels=channels, channels_out=channels_out, cond_on_text=cond_on_text) return self.__class__(**{ ** self._locals, ** updated_kwargs}) # methods for returning the full unet config as well as its parameter state def to_config_and_state_dict(self): return self._locals, self.state_dict() # class method for rehydrating the unet from its config and state dict @classmethod def from_config_and_state_dict(klass, config, state_dict): unet = klass(**config) unet.load_state_dict(state_dict) return unet # methods for persisting unet to disk def persist_to_file(self, path): path = Path(path) path.parents[0].mkdir(exist_ok=True, parents=True) config, state_dict = self.to_config_and_state_dict() pkg = dict(config=config, state_dict=state_dict) paddle.save(pkg, str(path)) # class method for rehydrating the unet from file saved with `persist_to_file` @classmethod def hydrate_from_file(klass, path): path = Path(path) assert path.exists() pkg = paddle.load(str(path)) assert 'config' in pkg and 'state_dict' in pkg config, state_dict = pkg['config'], pkg['state_dict'] return Unet.from_config_and_state_dict(config, state_dict) # forward with classifier free guidance def forward_with_cond_scale(self, *args, cond_scale=1., **kwargs): #print("forward_with_cond_scale.args[1]: ", args[1]) logits = self.forward(*args, **kwargs) if cond_scale == 1: return logits null_logits = self.forward(*args, cond_drop_prob=1., **kwargs) return null_logits + (logits - 
null_logits) * cond_scale def forward(self, x, time, *, lowres_cond_img=None, lowres_noise_times=None, text_embeds=None, text_mask=None, self_cond=None, cond_images=None, cond_drop_prob=0., use_recompute=False): batch_size = x.shape[0] # condition on self if self.self_cond: self_cond = default(self_cond, lambda: paddle.zeros_like(x)) x = paddle.concat((x, self_cond), axis=1) # add low resolution conditioning, if present assert not (self.lowres_cond and not exists(lowres_cond_img) ), 'low resolution conditioning image must be present' assert not (self.lowres_cond and not exists(lowres_noise_times) ), 'low resolution conditioning noise time must be present' if exists(lowres_cond_img): x = paddle.concat((x, lowres_cond_img), axis=1) # condition on input image assert not ( self.has_cond_image ^ exists(cond_images) ), 'you either requested to condition on an image on the unet, but the conditioning image is not supplied, or vice versa' if exists(cond_images): assert cond_images.shape[ 1] == self.cond_images_channels, 'the number of channels on the conditioning image you are passing in does not match what you specified on initialiation of the unet' cond_images = resize_image_to(cond_images, x.shape[-1]) x = paddle.concat((cond_images, x), axis=1) # initial convolution x = self.init_conv(x) # init conv residual if self.init_conv_to_final_conv_residual: init_conv_residual = x.clone() # time conditioning time_hiddens = self.to_time_hiddens(time) # derive time tokens time_tokens = self.to_time_tokens(time_hiddens) t = self.to_time_cond(time_hiddens) if use_recompute: t.stop_gradient = True # add lowres time conditioning to time hiddens # and add lowres time tokens along sequence dimension for attention if self.lowres_cond: lowres_time_hiddens = self.to_lowres_time_hiddens( lowres_noise_times) lowres_time_tokens = self.to_lowres_time_tokens( lowres_time_hiddens) lowres_t = self.to_lowres_time_cond(lowres_time_hiddens) t = t + lowres_t time_tokens = paddle.concat( (time_tokens, lowres_time_tokens), axis=-2) # text conditioning text_tokens = None if exists(text_embeds) and self.cond_on_text: # conditional dropout text_keep_mask = prob_mask_like((batch_size, ), 1 - cond_drop_prob) text_keep_mask_embed = text_keep_mask[:, None, None] text_keep_mask_hidden = text_keep_mask[:, None] # calculate text embeds text_tokens = self.text_to_cond(text_embeds) text_tokens = text_tokens[:, :self.max_text_len] if exists(text_mask): text_mask = text_mask[:, :self.max_text_len] text_tokens_len = text_tokens.shape[1] remainder = self.max_text_len - text_tokens_len if remainder > 0: text_tokens = F.pad(text_tokens, (0, remainder), data_format='NLC') if exists(text_mask): text_mask = text_mask[:, :, None].cast('float32') if remainder > 0: text_mask = F.pad(text_mask, (0, remainder), data_format='NLC') text_keep_mask_embed = text_mask.cast( bool) & text_keep_mask_embed null_text_embed = self.null_text_embed.cast(text_tokens.dtype) text_tokens = paddle.where(text_keep_mask_embed, text_tokens, null_text_embed) if exists(self.attn_pool): text_tokens = self.attn_pool(text_tokens) # extra non-attention conditioning by projecting and then summing text embeddings to time # termed as text hiddens mean_pooled_text_tokens = text_tokens.mean(axis=-2) text_hiddens = self.to_text_non_attn_cond(mean_pooled_text_tokens) null_text_hidden = self.null_text_hidden.cast(t.dtype) text_hiddens = paddle.where(text_keep_mask_hidden, text_hiddens, null_text_hidden) t = t + text_hiddens # main conditioning tokens (c) c = time_tokens if not 
exists(text_tokens) else paddle.concat( (time_tokens, text_tokens), axis=-2) # normalize conditioning tokens c = self.norm_cond(c) if use_recompute: c.stop_gradient = True # initial resnet block (for memory efficient unet) if exists(self.init_resnet_block): x = self.init_resnet_block(x, t) hiddens = [] for pre_downsample, init_block, resnet_blocks, attn_block, post_downsample in self.downs: if exists(pre_downsample): x = pre_downsample(x) x = init_block(x, t, c) for resnet_block in resnet_blocks: x = resnet_block(x, t) hiddens.append(x) x = attn_block(x, c) hiddens.append(x) if exists(post_downsample): x = post_downsample(x) x = self.mid_block1(x, t, c) if exists(self.mid_attn): x = self.mid_attn(x) x = self.mid_block2(x, t, c) add_skip_connection = lambda x: paddle.concat((x, hiddens.pop() * self.skip_connect_scale), axis=1) up_hiddens = [] for init_block, resnet_blocks, attn_block, upsample in self.ups: x = add_skip_connection(x) x = init_block(x, t, c) for resnet_block in resnet_blocks: x = add_skip_connection(x) x = resnet_block(x, t) x = attn_block(x, c) up_hiddens.append(x) x = upsample(x) x = self.upsample_combiner(x, up_hiddens) if self.init_conv_to_final_conv_residual: x = paddle.concat((x, init_conv_residual), axis=1) if exists(self.final_res_block): x = self.final_res_block(x, t) if exists(lowres_cond_img): x = paddle.concat((x, lowres_cond_img), axis=1) return self.final_conv(x) ================================================ FILE: ppfleetx/models/multimodal_model/imagen/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math from functools import partial, wraps import paddle from paddle import nn import paddle.nn.functional as F from paddle import expm1 # helper functions def exists(val): return val is not None def identity(t, *args, **kwargs): return t def first(arr, d=None): if len(arr) == 0: return d return arr[0] def maybe(fn): @wraps(fn) def inner(x): if not exists(x): return x return fn(x) return inner def once(fn): called = False @wraps(fn) def inner(x): nonlocal called if called: return called = True return fn(x) return inner print_once = once(print) def default(val, d): if exists(val): return val return d() if callable(d) else d def cast_tuple(val, length=None): if isinstance(val, list): val = tuple(val) output = val if isinstance(val, tuple) else ((val, ) * default(length, 1)) if exists(length): assert len(output) == length return output def is_float_dtype(dtype): return any([ dtype == float_dtype for float_dtype in (paddle.float64, paddle.float32, paddle.float16, paddle.bfloat16) ]) def cast_uint8_images_to_float(images): if not images.dtype == paddle.uint8: return images return images / 255 zeros_ = nn.initializer.Constant(value=0.) 
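# Illustrative usage of the helpers above (a hedged sketch, not part of the
# original module): `default` resolves optional values, invoking the fallback
# only when it is callable, and `cast_tuple` broadcasts scalar hyper-parameters
# to fixed-length per-layer tuples.
if __name__ == '__main__':
    assert default(None, 3) == 3
    assert default(None, lambda: 7) == 7
    assert default('keep', 3) == 'keep'
    assert cast_tuple(2, 4) == (2, 2, 2, 2)
    assert cast_tuple((1, 2, 4), 3) == (1, 2, 4)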
def zero_init_(m): zeros_(m.weight) if exists(m.bias): zeros_(m.bias) def eval_decorator(fn): def inner(model, *args, **kwargs): was_training = model.training model.eval() out = fn(model, *args, **kwargs) if was_training: model.train(was_training) return out return inner def pad_tuple_to_length(t, length, fillvalue=None): remain_length = length - len(t) if remain_length <= 0: return t return (*t, *((fillvalue, ) * remain_length)) # helper classes class Identity(nn.Layer): def __init__(self, *args, **kwargs): super().__init__() def forward(self, x, *args, **kwargs): return x # tensor helpers def log(t, eps: float=1e-12): return paddle.log(t.clip(min=eps)) class Parallel(nn.Layer): def __init__(self, *fns): super().__init__() self.fns = nn.LayerList(fns) def forward(self, x): outputs = [fn(x) for fn in self.fns] return sum(outputs) def l2norm(t): return F.normalize(t, axis=-1) def right_pad_dims_to(x, t): padding_dims = x.ndim - t.ndim if padding_dims <= 0: return t return t.reshape([*t.shape, *((1, ) * padding_dims)]) def masked_mean(t, *, axis, mask=None): if not exists(mask): return t.mean(axis=axis) denom = mask.sum(axis=axis, keepdim=True) mask = mask[:, :, None] masked_t = paddle.where(mask == 0, paddle.to_tensor(0.), t) return masked_t.sum(axis=axis) / denom.clip(min=1e-5) def resize_image_to(image, target_image_size, clamp_range=None): orig_image_size = image.shape[-1] if orig_image_size == target_image_size: return image out = F.interpolate( image, (target_image_size, target_image_size), mode='nearest') if exists(clamp_range): out = out.clip(*clamp_range) return out # image normalization functions # ddpms expect images to be in the range of -1 to 1 def normalize_neg_one_to_one(img): return img * 2 - 1 def unnormalize_zero_to_one(normed_img): return (normed_img + 1) * 0.5 # classifier free guidance functions def prob_mask_like(shape, prob): if prob == 1: return paddle.ones(shape, dtype=paddle.bool) elif prob == 0: return paddle.zeros(shape, dtype=paddle.bool) else: return paddle.zeros(shape).cast('float32').uniform_(0, 1) < prob def rearrange(tensor, pattern: str, b: int=-1, h: int=-1, w: int=-1, c: int=-1, x: int=-1, y: int=-1, n: int=-1, s1: int=-1, s2: int=-1): if pattern == 'b n (h d) -> b h n d': B, N, _ = tensor.shape return tensor.reshape([B, N, h, -1]).transpose([0, 2, 1, 3]) elif pattern == 'b n (h d) -> (b h) n d': B, N, _ = tensor.shape return tensor.reshape([B, N, h, -1]).transpose([0, 2, 1, 3]).reshape( [B * h, N, -1]) elif pattern == 'b (h c) x y -> (b h) (x y) c': B, _, _, _ = tensor.shape return tensor.reshape([B, h, -1, x, y]).transpose( [0, 1, 3, 4, 2]).reshape([B * h, x * y, -1]) elif pattern == 'b n ... -> b n (...)': B, N = tensor.shape[:2] return tensor.reshape([B, N, -1]) elif pattern == 'b ... 
-> b (...)': B = tensor.shape[0] return tensor.reshape([B, -1]) elif pattern == 'b j -> b 1 1 j': return tensor[:, None, None, :] elif pattern == 'b h n d -> b n (h d)': B, H, N, D = tensor.shape return tensor.transpose([0, 2, 1, 3]).reshape([B, N, -1]) elif pattern == '(b h) (x y) d -> b (h d) x y': _, _, D = tensor.shape return tensor.reshape([-1, h, x, y, D]).transpose( [0, 1, 4, 2, 3]).reshape([-1, h * D, x, y]) elif pattern == '(b h) n d -> b n (h d)': _, N, D = tensor.shape return tensor.reshape([-1, h, N, D]).transpose([0, 2, 1, 3]).reshape( [-1, N, h * D]) elif pattern == 'b n -> b n 1': return tensor[:, :, None] elif pattern == 'b c h w -> b (h w) c': B, C, H, W = tensor.shape return tensor.transpose([0, 2, 3, 1]).reshape([B, -1, C]) elif pattern == 'b (h w) c -> b c h w': B, _, C = tensor.shape return tensor.reshape([B, h, w, C]).transpose([0, 3, 1, 2]) elif pattern == 'b (n d) -> b n d': B, _ = tensor.shape return tensor.reshape([B, n, -1]) elif pattern == 'b ... -> b 1 ...': return tensor[:, None] elif pattern == 'b -> b 1 1 1': return tensor[:, None, None, None] elif pattern == 'b c (h s1) (w s2) -> b (c s1 s2) h w': assert s1 is not None assert s2 is not None B, C, H, W = tensor.shape tensor = tensor.reshape([B, C, H // s1, s1, W // s2, s2]) tensor = tensor.transpose([0, 1, 3, 5, 2, 4]) return tensor.reshape([B, C * s1 * s2, H // s1, W // s2]) def rearrange_many(tensors, pattern: str, h: int=-1, x: int=-1, y: int=-1): assert isinstance(tensors, ( list, tuple)), "rearrange_many type must be list or tuple" if isinstance(tensors, tuple): tensors = list(tensors) if len(tensors) == 0: raise TypeError("Rearrange can't be applied to an empty list") for i, tensor in enumerate(tensors): tensors[i] = rearrange(tensor, pattern, h=h, x=x, y=y) return tensors def repeat(tensor, pattern: str, h: int=-1, b: int=-1): if pattern == '1 -> b': if b > 1: b = paddle.to_tensor([b]) return paddle.tile(tensor, repeat_times=b) else: return tensor elif pattern == 't -> b t': tensor = tensor[None, :] return paddle.tile(tensor, repeat_times=(b, 1)) elif pattern == 'n d -> b n d': tensor = tensor[None, :] return paddle.tile(tensor, repeat_times=(b, 1, 1)) elif pattern == 'o ... -> (o 4) ...': return paddle.tile(tensor, repeat_times=(4, 1, 1, 1)) elif pattern == 'd -> b h 1 d': tensor = tensor[None, None, None, :] return paddle.tile(tensor, repeat_times=(b, h, 1, 1)) elif pattern == 'd -> b 1 d': tensor = tensor[None, None, :] return paddle.tile(tensor, repeat_times=(b, 1, 1)) def repeat_many(tensors, pattern: str, h: int=-1, b: int=-1): assert isinstance(tensors, (list, tuple)) if isinstance(tensors, tuple): tensors = list(tensors) if len(tensors) == 0: raise TypeError("Rearrange can't be applied to an empty list") for i, tensor in enumerate(tensors): tensors[i] = repeat(tensor, pattern, h=h, b=b) return tensors def reduce(losses, pattern: str, reduction: str='mean'): if pattern == 'b ... 
-> b': axes = list(range(1, len(losses.shape))) return losses.mean(axes) class EinopsToAndFrom(nn.Layer): def __init__(self, from_einops, to_einops, fn): super().__init__() self.from_einops = from_einops self.to_einops = to_einops self.fn = fn def forward(self, x, **kwargs): shape = x.shape reconstitute_kwargs = dict( tuple(zip(self.from_einops.split(' '), shape))) x = rearrange(x, f'{self.from_einops} -> {self.to_einops}') x = self.fn(x, **kwargs) x = rearrange(x, f'{self.to_einops} -> {self.from_einops}', **reconstitute_kwargs) return x class Rearrange(nn.Layer): def __init__(self, pattern, n=None, s1=None, s2=None): super().__init__() self.pattern = pattern self.n = n self.s1 = s1 self.s2 = s2 def forward(self, x, **kwargs): x = rearrange(x, f'{self.pattern}', n=self.n, s1=self.s1, s2=self.s2) return x # classifier free guidance functions # gaussian diffusion with continuous time helper functions and classes # large part of this was thanks to @crowsonkb at https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/utils.py def beta_linear_log_snr(t): return -paddle.log(expm1(1e-4 + 10 * (t**2))) def alpha_cosine_log_snr(t, s: float=0.008): return -log( (paddle.cos((t + s) / (1 + s) * math.pi * 0.5)**-2) - 1, eps=1e-5 ) # not sure if this accounts for beta being clipped to 0.999 in discrete version def log_snr_to_alpha_sigma(log_snr): return paddle.sqrt(F.sigmoid(log_snr)), paddle.sqrt(F.sigmoid(-log_snr)) class GaussianDiffusionContinuousTimes(nn.Layer): def __init__(self, *, noise_schedule, timesteps=1000): super().__init__() if noise_schedule == 'linear': self.log_snr = beta_linear_log_snr elif noise_schedule == "cosine": self.log_snr = alpha_cosine_log_snr else: raise ValueError(f'invalid noise schedule {noise_schedule}') self.num_timesteps = timesteps def get_times(self, batch_size, noise_level): return paddle.full((batch_size, ), noise_level, dtype=paddle.float32) def sample_random_times(self, batch_size): return paddle.zeros((batch_size, )).cast('float32').uniform_(0, 1) def get_condition(self, times): return maybe(self.log_snr)(times) def get_sampling_timesteps(self, batch): times = paddle.linspace(1., 0., self.num_timesteps + 1) times = repeat(times, 't -> b t', b=batch) times = paddle.stack((times[:, :-1], times[:, 1:]), axis=0) times = times.unbind(axis=-1) return times def q_posterior(self, x_start, x_t, t, *, t_next=None): t_next = default( t_next, lambda: (t - 1. / self.num_timesteps).clip(min=0.)) """ https://openreview.net/attachment?id=2LdBqxc1Yv&name=supplementary_material """ log_snr = self.log_snr(t) log_snr_next = self.log_snr(t_next) log_snr, log_snr_next = map( partial(right_pad_dims_to, x_t), (log_snr, log_snr_next)) alpha, sigma = log_snr_to_alpha_sigma(log_snr) alpha_next, sigma_next = log_snr_to_alpha_sigma(log_snr_next) # c - as defined near eq 33 c = -expm1(log_snr - log_snr_next) posterior_mean = alpha_next * (x_t * (1 - c) / alpha + c * x_start) # following (eq. 
33) posterior_variance = (sigma_next**2) * c posterior_log_variance_clipped = log(posterior_variance, eps=1e-20) return posterior_mean, posterior_variance, posterior_log_variance_clipped def q_sample(self, x_start, t, noise=None): dtype = x_start.dtype if isinstance(t, float): batch = x_start.shape[0] t = paddle.full((batch, ), t, dtype=dtype) noise = default(noise, lambda: paddle.randn(shape=x_start.shape, dtype=dtype)) log_snr = self.log_snr(t).cast(dtype) log_snr_padded_dim = right_pad_dims_to(x_start, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim) return alpha * x_start + sigma * noise, log_snr, alpha, sigma def q_sample_from_to(self, x_from, from_t, to_t, noise=None): shape, dtype = x_from.shape, x_from.dtype batch = shape[0] if isinstance(from_t, float): from_t = paddle.full((batch, ), from_t, dtype=dtype) if isinstance(to_t, float): to_t = paddle.full((batch, ), to_t, dtype=dtype) noise = default(noise, lambda: paddle.randn(shape=x_from.shape, dtype=x_from.dtype)) log_snr = self.log_snr(from_t) log_snr_padded_dim = right_pad_dims_to(x_from, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim) log_snr_to = self.log_snr(to_t) log_snr_padded_dim_to = right_pad_dims_to(x_from, log_snr_to) alpha_to, sigma_to = log_snr_to_alpha_sigma(log_snr_padded_dim_to) return x_from * (alpha_to / alpha) + noise * (sigma_to * alpha - sigma * alpha_to) / alpha def predict_start_from_v(self, x_t, t, v): log_snr = self.log_snr(t) log_snr = right_pad_dims_to(x_t, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr) return alpha * x_t - sigma * v def predict_start_from_noise(self, x_t, t, noise): log_snr = self.log_snr(t) log_snr = right_pad_dims_to(x_t, log_snr) alpha, sigma = log_snr_to_alpha_sigma(log_snr) return (x_t - sigma * noise) / alpha.clip(min=1e-8) class Always(): def __init__(self, val): self.val = val def __call__(self, *args, **kwargs): return self.val ================================================ FILE: ppfleetx/models/multimodal_model/multimodal_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
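# This module adapts the generic `BasicModule` training interface to the Imagen
# text-to-image model: `MultiModalModule` wires the forward/loss/logging steps,
# and `ImagenModule` builds the concrete model and `ImagenCriterion` loss from
# the `Model` and `Loss` sections of the config.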
import sys import copy import paddle from ppfleetx.core.module.basic_module import BasicModule import ppfleetx.models.multimodal_model.imagen as imagen from ppfleetx.utils.log import logger from .utils import process_configs class MultiModalModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() super(MultiModalModule, self).__init__(configs) self.loss_fn = self.get_loss_fn() def process_configs(self, configs): configs = process_configs(configs) return configs def forward(self, batch): return self.model(**batch) def training_step(self, batch): preds, targets, log_snr, p2_loss_weight_gamma = self(batch) loss = self.loss_fn(preds, targets, log_snr, p2_loss_weight_gamma) return loss def training_step_end(self, log_dict): speed = self.configs.Engine.logging_freq / log_dict['train_cost'] logger.info( "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, learning rate: %.5e" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], 1. / speed, speed, log_dict['lr'])) def validation_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def validation_step_end(self, log_dict): speed = self.configs.Engine.logging_freq / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], 1. / speed, speed)) def test_step(self, batch): tokens, position_ids, labels, loss_mask = batch preds = self(tokens, position_ids) preds = paddle.cast(preds, dtype="float32") loss = self.loss_fn(preds, labels, loss_mask) return loss def test_step_end(self, log_dict): speed = self.configs.Engine.logging_freq / log_dict['test_cost'] logger.info( "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s" % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], 1. / speed, speed)) def input_spec(self): return [ InputSpec( shape=[None, None], name="tokens", dtype='int64'), InputSpec( shape=[None, None], name="ids", dtype='int64') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) class ImagenModule(MultiModalModule): def __init__(self, configs): super(ImagenModule, self).__init__(configs) def get_model(self): model_setting = copy.deepcopy(self.configs.Model) model_setting.pop("module") imagen_model = model_setting.pop("name") model = getattr(imagen, imagen_model)(**model_setting) return model def get_loss_fn(self): model_setting = copy.deepcopy(self.configs.Loss) loss_fn = imagen.ImagenCriterion(**model_setting) return loss_fn def pretreating_batch(self, batch): return batch ================================================ FILE: ppfleetx/models/multimodal_model/utils.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import logging import os import sys import copy import yaml import numpy as np import paddle import paddle.distributed as dist from paddle.fluid import core import argparse from functools import reduce from ppfleetx.distributed.apis import env def process_global_configs(config): """ process global configs for hybrid parallel """ dp_degree = config['Distributed']['dp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] configs = config['Global'] if configs['global_batch_size'] is None and configs[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif configs['global_batch_size'] is not None and configs[ 'local_batch_size'] is not None: assert configs['global_batch_size'] // configs['local_batch_size'] == (dp_degree * sharding_degree), "global_batch_size[{}] should be divided by local_batch_size[{}] "\ "when dp_degree is [{}] and sharding_degree is [{}]".format(configs['global_batch_size'], configs['local_batch_size'], dp_degree, sharding_degree) elif configs['global_batch_size'] is not None and configs[ 'local_batch_size'] is None: assert configs['global_batch_size'] % (dp_degree * sharding_degree) == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]"\ .format(configs['global_batch_size'], dp_degree, sharding_degree) configs['local_batch_size'] = configs['global_batch_size'] // ( dp_degree * sharding_degree) else: configs['global_batch_size'] = configs[ 'local_batch_size'] * dp_degree * sharding_degree assert configs['local_batch_size'] % configs['micro_batch_size'] == 0 def is_fused_matmul_bias_supported(): if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') else: return False def process_fused_configs(config): """ process fused configs for hybrid parallel """ nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] configs = config['Fused'] if configs['tensor_fusion']: assert nranks == dp_degree, "tensor_fusion only support single card train or data parallel train" def process_inference_configs(config): """ process fused configs for hybrid parallel """ configs = config['Inference'] if configs['model_dir'] is None: configs['model_dir'] = config['Engine']['save_load']['output_dir'] if configs['mp_degree'] is None: configs['mp_degree'] = config['Distributed']['mp_degree'] def process_model_configs(config): """ process model configs for hybrid parallel """ configs = config['Model'] if configs['use_recompute']: if not configs['recompute_granularity']: configs['recompute_granularity'] = 'full' if configs['fused_linear'] and not is_fused_matmul_bias_supported(): configs['fused_linear'] = False logging.warning( "The flag fused_linear only valid for cuda version higher than 11.6, " "but the paddle is compiled with cuda " + paddle.version.cuda()) def process_optim_configs(config): """ process optim configs for hybrid parallel """ config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][ 'enable'] def process_engine_configs(config): """ process engine configs for hybrid parallel """ configs = config['Engine'] configs['test_iters'] = configs['eval_iters'] * 10 \ if configs.get('test_iters', None) is None \ else configs['test_iters'] configs['accumulate_steps'] = config['Global']['local_batch_size'] \ // config['Global']['micro_batch_size'] def 
process_configs(config): process_fused_configs(config) process_model_configs(config) process_optim_configs(config) process_inference_configs(config) return config ================================================ FILE: ppfleetx/models/protein_folding/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/protein_folding/all_atom.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Dict, Optional import paddle from .common import (batched_gather, ) from . import ( residue_constants, r3, ) def get_chi_atom_indices(): """Returns atom indices needed to compute chi angles for all residue types. Returns: A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are in the order specified in residue_constants.restypes + unknown residue type at the end. For chi angles which are not defined on the residue, the positions indices are by default set to 0. """ chi_atom_indices = [] for residue_name in residue_constants.restypes: residue_name = residue_constants.restype_1to3[residue_name] residue_chi_angles = residue_constants.chi_angles_atoms[residue_name] atom_indices = [] for chi_angle in residue_chi_angles: atom_indices.append( [residue_constants.atom_order[atom] for atom in chi_angle]) for _ in range(4 - len(atom_indices)): atom_indices.append( [0, 0, 0, 0]) # For chi angles not defined on the AA. chi_atom_indices.append(atom_indices) chi_atom_indices.append([[0, 0, 0, 0]] * 4) # For UNKNOWN residue. return paddle.to_tensor(chi_atom_indices) def atom37_to_torsion_angles( aatype: paddle.Tensor, # (B, T, N) all_atom_pos: paddle.Tensor, # (B, T, N, 37, 3) all_atom_mask: paddle.Tensor, # (B, T, N, 37) placeholder_for_undefined=False, ) -> Dict[str, paddle.Tensor]: """Computes the 7 torsion angles (in sin, cos encoding) for each residue. The 7 torsion angles are in the order '[pre_omega, phi, psi, chi_1, chi_2, chi_3, chi_4]', here pre_omega denotes the omega torsion angle between the given amino acid and the previous amino acid. Args: aatype: Amino acid type, given as array with integers. all_atom_pos: atom37 representation of all atom coordinates. all_atom_mask: atom37 representation of mask on all atom coordinates. placeholder_for_undefined: flag denoting whether to set masked torsion angles to zero. 
Returns: Dict containing: * 'torsion_angles_sin_cos': Array with shape (B, N, 7, 2) where the final 2 dimensions denote sin and cos respectively * 'alt_torsion_angles_sin_cos': same as 'torsion_angles_sin_cos', but with the angle shifted by pi for all chi angles affected by the naming ambiguities. * 'torsion_angles_mask': Mask for which chi angles are present. """ # Map aatype > 20 to 'Unknown' (20). aatype = paddle.minimum( aatype.astype('int'), paddle.full( shape=[1], fill_value=20, dtype='int')) num_batch, num_temp, num_res = aatype.shape # Compute the backbone angles. pad = paddle.zeros([num_batch, num_temp, 1, 37, 3]) prev_all_atom_pos = paddle.concat( [pad, all_atom_pos[..., :-1, :, :]], axis=-3) pad = paddle.zeros([num_batch, num_temp, 1, 37]) prev_all_atom_mask = paddle.concat( [pad, all_atom_mask[..., :-1, :]], axis=-2) # For each torsion angle collect the 4 atom positions that define this angle. # shape (B, T, N, atoms=4, xyz=3) pre_omega_atom_pos = paddle.concat( [ prev_all_atom_pos[..., 1:3, :], # prev CA, C all_atom_pos[..., 0:2, :] # this N, CA ], axis=-2) phi_atom_pos = paddle.concat( [ prev_all_atom_pos[..., 2:3, :], # prev C all_atom_pos[..., 0:3, :] # this N, CA, C ], axis=-2) psi_atom_pos = paddle.concat( [ all_atom_pos[..., 0:3, :], # this N, CA, C all_atom_pos[..., 4:5, :] # this O ], axis=-2) # Collect the masks from these atoms. # Shape [batch, n_temp, num_res] pre_omega_mask = ( paddle.prod( prev_all_atom_mask[..., 1:3], axis=-1) # prev CA, C * paddle.prod( all_atom_mask[..., 0:2], axis=-1)) # this N, CA phi_mask = ( prev_all_atom_mask[..., 2] # prev C * paddle.prod( all_atom_mask[..., 0:3], axis=-1)) # this N, CA, C psi_mask = ( paddle.prod( all_atom_mask[..., 0:3], axis=-1) * # this N, CA, C all_atom_mask[..., 4]) # this O # Collect the atoms for the chi-angles. # Compute the table of chi angle indices. Shape: [restypes, chis=4, atoms=4]. chi_atom_indices = get_chi_atom_indices() # Select atoms to compute chis. Shape: [batch, num_temp, num_res, chis=4, atoms=4]. atom_indices = batched_gather( params=chi_atom_indices, indices=aatype, axis=0, batch_dims=0) # Gather atom positions. Shape: [batch, num_temp, num_res, chis=4, atoms=4, xyz=3]. chis_atom_pos = batched_gather( params=all_atom_pos, indices=atom_indices, axis=0, batch_dims=3) # Copy the chi angle mask, add the UNKNOWN residue. Shape: [restypes, 4]. chi_angles_mask = list(residue_constants.chi_angles_mask) chi_angles_mask.append([0.0, 0.0, 0.0, 0.0]) chi_angles_mask = paddle.to_tensor(chi_angles_mask) # Compute the chi angle mask. I.e. which chis angles exist according to the # aatype. Shape [batch, num_temp, num_res, chis=4]. chis_mask = batched_gather( params=chi_angles_mask, indices=aatype, axis=0, batch_dims=0) # Constrain the chis_mask to those chis, where the ground truth coordinates of # all defining four atoms are available. # Gather the chi angle atoms mask. Shape: [batch, num_temp, num_res, chis=4, atoms=4]. chi_angle_atoms_mask = batched_gather( params=all_atom_mask, indices=atom_indices, axis=0, batch_dims=3) # Check if all 4 chi angle atoms were set. Shape: [batch, num_temp, num_res, chis=4]. chi_angle_atoms_mask = paddle.prod(chi_angle_atoms_mask, axis=[-1]) chis_mask = chis_mask * chi_angle_atoms_mask # Stack all torsion angle atom positions. 
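# Order of the stacked angles: [pre_omega, phi, psi, chi_1, chi_2, chi_3, chi_4],
# matching the convention documented in the function docstring.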
# Shape (B, T, N, torsions=7, atoms=4, xyz=3) torsions_atom_pos = paddle.concat( [ pre_omega_atom_pos.unsqueeze(axis=-3), # [:, :, :, None, :, :] phi_atom_pos.unsqueeze(axis=-3), # [:, :, :, None, :, :] psi_atom_pos.unsqueeze(axis=-3), # [:, :, :, None, :, :] chis_atom_pos ], axis=3) # Stack up masks for all torsion angles. # shape (B, T, N, torsions=7) torsion_angles_mask = paddle.concat( [ pre_omega_mask.unsqueeze(axis=-1), # [..., None] phi_mask.unsqueeze(axis=-1), # [..., None] psi_mask.unsqueeze(axis=-1), # [..., None] chis_mask ], axis=-1) # Create a frame from the first three atoms: # First atom: point on x-y-plane # Second atom: point on negative x-axis # Third atom: origin # r3.Rigids (B, T, N, torsions=7) torsion_frames = r3.rigids_from_3_points_vecs( point_on_neg_x_axis=r3.Vecs(torsions_atom_pos[..., 1, :]), origin=r3.Vecs(torsions_atom_pos[..., 2, :]), point_on_xy_plane=r3.Vecs(torsions_atom_pos[..., 0, :])) # Compute the position of the forth atom in this frame (y and z coordinate # define the chi angle) # r3.Vecs (B, T, N, torsions=7) forth_atom_rel_pos = r3.rigids_mul_vecs( r3.invert_rigids(torsion_frames), r3.vecs_from_tensor(torsions_atom_pos[..., 3, :])) # Normalize to have the sin and cos of the torsion angle. # paddle.Tensor (B, T, N, torsions=7, sincos=2) torsion_angles_sin_cos = paddle.stack( [forth_atom_rel_pos.z, forth_atom_rel_pos.y], axis=-1) torsion_angles_sin_cos /= paddle.sqrt( paddle.sum(paddle.square(torsion_angles_sin_cos), axis=-1, keepdim=True) + 1e-8) # Mirror psi, because we computed it from the Oxygen-atom. torsion_angles_sin_cos *= paddle.to_tensor( [1., 1., -1., 1., 1., 1., 1.]).reshape( [1, 1, 1, 7, 1]) # [None, None, None, :, None] # Create alternative angles for ambiguous atom names. chi_is_ambiguous = batched_gather( paddle.to_tensor(residue_constants.chi_pi_periodic), aatype) # chi_is_ambiguous (B, T, N, torsions=4) mirror_torsion_angles = paddle.concat( [ paddle.ones([num_batch, num_temp, num_res, 3]), 1.0 - 2.0 * chi_is_ambiguous ], axis=-1) # mirror_torsion_angles (B, T, N, torsions=7) alt_torsion_angles_sin_cos = torsion_angles_sin_cos * mirror_torsion_angles.unsqueeze( axis=-1) # [:, :, :, :, None] if placeholder_for_undefined: # Add placeholder torsions in place of undefined torsion angles # (e.g. N-terminus pre-omega) placeholder_torsions = paddle.stack( [ paddle.ones(torsion_angles_sin_cos.shape[:-1]), paddle.zeros(torsion_angles_sin_cos.shape[:-1]) ], axis=-1) torsion_angles_sin_cos = torsion_angles_sin_cos * torsion_angles_mask.unsqueeze( axis=-1) + placeholder_torsions * ( 1 - torsion_angles_mask.unsqueeze(axis=-1)) alt_torsion_angles_sin_cos = alt_torsion_angles_sin_cos * torsion_angles_mask.unsqueeze( axis=-1) + placeholder_torsions * ( 1 - torsion_angles_mask.unsqueeze(axis=-1)) return { 'torsion_angles_sin_cos': torsion_angles_sin_cos, # (B, T, N, 7, 2) 'alt_torsion_angles_sin_cos': alt_torsion_angles_sin_cos, # (B, T, N, 7, 2) 'torsion_angles_mask': torsion_angles_mask # (B, T, N, 7) } ================================================ FILE: ppfleetx/models/protein_folding/attentions.py ================================================ """attentions.py.""" # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import gc import numpy as np import paddle import paddle.nn as nn try: from paddle import _legacy_C_ops as _C_ops except: from paddle import _C_ops from ppfleetx.distributed.protein_folding import dap from .common import ( init_gate_linear, init_final_linear, mask_mean, subbatch, ) class Attention(nn.Layer): """Multihead attention.""" def __init__(self, config, global_config, q_dim, kv_dim, output_dim): super(Attention, self).__init__() self.config = config self.global_config = global_config num_head = self.config.num_head key_dim = self.config.get('key_dim', q_dim) value_dim = self.config.get('value_dim', kv_dim) # TODO(GuoxiaWang): delete non fuse_attention related code on dcu self.fuse_attention = self.global_config.fuse_attention self.use_flash_attn = self.global_config.use_flash_attn self.merge_qkv = (q_dim == kv_dim) assert key_dim % num_head == 0 assert value_dim % num_head == 0 key_dim = key_dim // num_head value_dim = value_dim // num_head self.key_dim = key_dim self.value_dim = value_dim self.qkv_w = None self.query_w = None self.key_w = None self.value_w = None if self.merge_qkv and self.fuse_attention: self.qkv_w = paddle.create_parameter( [3, num_head, key_dim, q_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) else: self.query_w = paddle.create_parameter( [q_dim, num_head, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.key_w = paddle.create_parameter( [kv_dim, num_head, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.value_w = paddle.create_parameter( [kv_dim, num_head, value_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.gating_w = None self.gating_b = None if self.config.gating: self.gating_w = paddle.create_parameter( [q_dim, num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) self.gating_b = paddle.create_parameter( [num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(1.0)) if self.global_config.zero_init: init = nn.initializer.Constant(0.0) else: init = nn.initializer.XavierUniform() self.output_w = paddle.create_parameter( [num_head, value_dim, output_dim], 'float32', default_initializer=init) self.output_b = paddle.create_parameter( [output_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) def forward(self, q_data, m_data, bias, nonbatched_bias=None): """Builds Attention module. Args: q_data (float): A tensor of queries, shape [batch, row_size, N_queries, q_channels]. m_data (float): A tensor of memories from which the keys and values are projected, shape [batch, row_size, N_keys, m_channels]. bias (float): A bias for the attention, shape [batch, row_size, num_head, N_queries, N_keys]. nonbatched_bias (float): Shared bias, shape [N_queries, N_keys]. Returns: A float32 tensor of shape [batch_size, row_size, N_queries, output_dim]. 
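Note: when `fuse_attention` is enabled, the QKV, gating and output projections
all run inside the fused `fused_gate_attention` kernel; otherwise the explicit
einsum path below is taken, with `nonbatched_bias` broadcast over the row axis.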
""" if self.fuse_attention: if nonbatched_bias is not None: nonbatched_bias = paddle.unsqueeze(nonbatched_bias, axis=1) import paddle.incubate.nn.functional as F output = F.fused_gate_attention( query=q_data, key=m_data, query_weight=self.query_w, key_weight=self.key_w, value_weight=self.value_w, qkv_weight=self.qkv_w, gate_linear_weight=self.gating_w, gate_linear_bias=self.gating_b, out_linear_weight=self.output_w, out_linear_bias=self.output_b, nonbatched_bias=nonbatched_bias, attn_mask=bias, has_gating=self.config.gating, merge_qkv=self.merge_qkv, use_flash_attn=self.use_flash_attn, ) else: c = self.key_dim**(-0.5) q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) * c k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k) + bias if nonbatched_bias is not None: logits += paddle.unsqueeze(nonbatched_bias, axis=1) weights = nn.functional.softmax(logits) weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) if self.config.gating: gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, self.gating_w) + self.gating_b gate_values = nn.functional.sigmoid(gate_values) weighted_avg *= gate_values output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, self.output_w) + self.output_b return output class GlobalAttention(nn.Layer): """Global attention. Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" lines 2-7 """ def __init__(self, config, global_config, q_dim, kv_dim, output_dim): super(GlobalAttention, self).__init__() self.config = config self.global_config = global_config num_head = self.config.num_head key_dim = self.config.get('key_dim', q_dim) value_dim = self.config.get('value_dim', kv_dim) assert key_dim % num_head == 0 assert value_dim % num_head == 0 key_dim = key_dim // num_head value_dim = value_dim // num_head self.key_dim = key_dim self.value_dim = value_dim self.query_w = paddle.create_parameter( [q_dim, num_head, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.key_w = paddle.create_parameter( [kv_dim, key_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) self.value_w = paddle.create_parameter( [kv_dim, value_dim], 'float32', default_initializer=nn.initializer.XavierUniform()) if self.config.gating: self.gating_w = paddle.create_parameter( [q_dim, num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) self.gating_b = paddle.create_parameter( [num_head, value_dim], 'float32', default_initializer=nn.initializer.Constant(1.0)) if self.global_config.zero_init: init = nn.initializer.Constant(0.0) else: init = nn.initializer.XavierUniform() self.output_w = paddle.create_parameter( [num_head, value_dim, output_dim], 'float32', default_initializer=init) self.output_b = paddle.create_parameter( [output_dim], 'float32', default_initializer=nn.initializer.Constant(0.0)) def forward(self, q_data, m_data, q_mask): """Builds Attention module. Args: q_data (float): A tensor of queries, shape [batch, row_size, N_queries, q_channels]. m_data (float): A tensor of memories from which the keys and values are projected, shape [batch, row_size, N_keys, m_channels]. q_mask (float): A tensor of mask. Returns: A float32 tensor of output. 
""" k = paddle.einsum('nbka,ac->nbkc', m_data, self.key_w) v = paddle.einsum('nbka,ac->nbkc', m_data, self.value_w) # NOTE: differ from non-global version using q_avg for attn q_avg = mask_mean(q_mask, q_data, axis=2) c = self.key_dim**(-0.5) q = paddle.einsum('nba,ahc->nbhc', q_avg, self.query_w) * c q_mask_ = paddle.unsqueeze(q_mask, axis=2)[..., 0] bias = 1e9 * (q_mask_ - 1.) logits = paddle.einsum('nbhc,nbkc->nbhk', q, k) + bias weights = nn.functional.softmax(logits) weighted_avg = paddle.einsum('nbhk,nbkc->nbhc', weights, v) if self.config.gating: gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, self.gating_w) + self.gating_b gate_values = nn.functional.sigmoid(gate_values) weighted_avg = paddle.unsqueeze(weighted_avg, axis=2) weighted_avg *= gate_values output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, self.output_w) + self.output_b else: output = paddle.einsum('nbhc,hco->nbo', weighted_avg, self.output_w) + self.output_b output = paddle.unsqueeze(output, axis=-1) return output class MSARowAttentionWithPairBias(nn.Layer): """MSA per-row attention biased by the pair representation. Jumper et al. (2021) Suppl. Alg. 7 "MSARowAttentionWithPairBias" """ def __init__(self, channel_num, config, global_config, is_extra_msa): super(MSARowAttentionWithPairBias, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config self.is_extra_msa = is_extra_msa assert config.orientation == 'per_row' if is_extra_msa: self.query_norm = nn.LayerNorm(channel_num['extra_msa_channel']) else: self.query_norm = nn.LayerNorm(channel_num['msa_channel']) self.feat_2d_norm = nn.LayerNorm(channel_num['pair_channel']) self.feat_2d_weights = paddle.create_parameter( [channel_num['pair_channel'], self.config.num_head], 'float32', default_initializer=nn.initializer.Normal( std=1. / np.sqrt(channel_num['pair_channel']))) if is_extra_msa: extra_msa_channel = channel_num['extra_msa_channel'] self.attention = Attention(self.config, self.global_config, extra_msa_channel, extra_msa_channel, extra_msa_channel) else: msa_channel = channel_num['msa_channel'] self.attention = Attention(self.config, self.global_config, msa_channel, msa_channel, msa_channel) def forward(self, msa_act, msa_mask, pair_act): """MSARowAttention with masks. Args: msa_act (float): A tensor of msa_act. msa_mask (float): A tensor of msa_mask. pair_act (float): A tensor of pair_act. Returns: A float32 tensor of msa_act. """ pair_act = self.feat_2d_norm(pair_act) # [B, N_res//dap_size, N_res, cz], [cz, head] => [B, head, N_res//dap_size, N_res] nonbatched_bias_before = paddle.einsum('nqkc,ch->nhqk', pair_act, self.feat_2d_weights) # [B, head, N_res//dap_size, N_res] => [B, head, N_res, N_res] nonbatched_bias = dap.all_gather(nonbatched_bias_before, axis=2) # if not self.training: if not self.training and self.global_config.low_memory is True: del nonbatched_bias_before gc.collect() nonbatched_bias = dap.all_gather_opp(nonbatched_bias, axis=2) # [B, N_seq, N_res] => [B, N_seq//dap_size, N_res] msa_mask = dap.scatter(msa_mask, axis=1) bias = 1e9 * (msa_mask - 1.) 
# [B, N_seq//dap_size, N_res] => [B, N_seq//dap_size, 1, 1, N_res] bias = paddle.unsqueeze(bias, axis=[2, 3]) msa_act = self.query_norm(msa_act) if not self.training or (self.is_extra_msa and self.config.use_subbatch): # low memory mode using subbatch subbatch_size = self.config.subbatch_size if not self.training: subbatch_size = self.global_config.subbatch_size sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], subbatch_size, 1, same_arg_idx={1: 0}) msa_act = sb_attn(msa_act, msa_act, bias, nonbatched_bias) else: msa_act = self.attention(msa_act, msa_act, bias, nonbatched_bias) return msa_act class MSAColumnGlobalAttention(nn.Layer): """MSA per-column global attention. Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" """ def __init__(self, channel_num, config, global_config): super(MSAColumnGlobalAttention, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config assert config.orientation == 'per_column' extra_msa_channel = channel_num['extra_msa_channel'] self.query_norm = nn.LayerNorm(extra_msa_channel) self.attention = GlobalAttention(self.config, self.global_config, extra_msa_channel, extra_msa_channel, extra_msa_channel) def forward(self, msa_act, msa_mask): """MSAColumnGlobalAttention. Args: msa_act (float): A tensor of msa_act. msa_mask (float): A tensor of msa_mask. Returns: A float32 tensor of msa_act. """ # scatter if using dap, otherwise do nothing # [B, N_seq, N_res] => [B, N_seq, N_res//dap_size] msa_mask = dap.scatter(msa_mask, axis=2) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) msa_mask = paddle.transpose(msa_mask, [0, 2, 1]) bias = 1e9 * (msa_mask - 1.) bias = paddle.unsqueeze(bias, axis=[2, 3]) msa_mask = paddle.unsqueeze(msa_mask, axis=-1) msa_act = self.query_norm(msa_act) if not self.training: # low memory mode using subbatch sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], self.global_config.subbatch_size, 1, same_arg_idx={1: 0}) msa_act = sb_attn(msa_act, msa_act, msa_mask) else: msa_act = self.attention(msa_act, msa_act, msa_mask) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) return msa_act class MSAColumnAttention(nn.Layer): """MSA per-column attention. Jumper et al. (2021) Suppl. Alg. 8 "MSAColumnAttention" """ def __init__(self, channel_num, config, global_config): super(MSAColumnAttention, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config assert config.orientation == 'per_column' msa_channel = channel_num['msa_channel'] self.query_norm = nn.LayerNorm(msa_channel) self.attention = Attention(self.config, self.global_config, msa_channel, msa_channel, msa_channel) def forward(self, msa_act, msa_mask): """MSAColumnAttention. Args: msa_act (float): A tensor of msa_act. msa_mask (float): A tensor of msa_mask. Returns: A float32 tensor of msa_act. """ # scatter if using dap, otherwise do nothing # [B, N_seq, N_res] => [B, N_seq, N_res//dap_size] msa_mask = dap.scatter(msa_mask, axis=2) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) msa_mask = paddle.transpose(msa_mask, [0, 2, 1]) bias = 1e9 * (msa_mask - 1.) 
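# After the [0, 2, 1, 3] transpose above, axis 1 indexes residues, so the
# attention below mixes information along the MSA column (per-residue, across
# sequences) as in Suppl. Alg. 8.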
bias = paddle.unsqueeze(bias, axis=[2, 3]) msa_act = self.query_norm(msa_act) if not self.training: # low memory mode using subbatch sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], self.global_config.subbatch_size, 1, same_arg_idx={1: 0}) msa_act = sb_attn(msa_act, msa_act, bias) else: msa_act = self.attention(msa_act, msa_act, bias) msa_act = paddle.transpose(msa_act, [0, 2, 1, 3]) return msa_act class TriangleAttention(nn.Layer): """Triangle Attention. Jumper et al. (2021) Suppl. Alg. 13 "TriangleAttentionStartingNode" Jumper et al. (2021) Suppl. Alg. 14 "TriangleAttentionEndingNode" """ def __init__(self, channel_num, config, global_config, name='triangle_attention'): super(TriangleAttention, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config assert config.orientation in ['per_row', 'per_column'] self.query_norm = nn.LayerNorm( channel_num['pair_channel'], name='query_norm') self.feat_2d_weights = paddle.create_parameter( [channel_num['pair_channel'], self.config.num_head], 'float32', default_initializer=nn.initializer.Normal( std=1. / np.sqrt(channel_num['pair_channel']))) self.attention = Attention( self.config, self.global_config, channel_num['pair_channel'], channel_num['pair_channel'], channel_num['pair_channel']) def forward(self, pair_act, pair_mask): """Builds TriangleAttention module. Args: pair_act (float): [batch, N_res, N_res, c_z] pair activations tensor pair_mask (float): [batch, N_res, N_res] mask of non-padded regions in the tensor. Returns: Update to pair_act, shape [batch, N_res, N_res, c_z]. """ if self.config.orientation == 'per_column': pair_act = pair_act.transpose([0, 2, 1, 3]) pair_mask = pair_mask.transpose([0, 2, 1]) # [B, N_res//dap_size, N_res] bias = 1e9 * (pair_mask - 1.) # [B, N_res//dap_size, 1, 1, N_res] bias = paddle.unsqueeze(bias, axis=[2, 3]) pair_act = self.query_norm(pair_act) # [B, N_res//dap_size, N_res, cz], [cz, head] => [B, head, N_res//dap_size, N_res] nonbatched_bias_before = paddle.einsum('bqkc,ch->bhqk', pair_act, self.feat_2d_weights) # # [B, head, N_res//dap_size, N_res] => [B, head, N_res, N_res] nonbatched_bias = dap.all_gather(nonbatched_bias_before, axis=2) # if not self.training: if not self.training and self.global_config.low_memory is True: del nonbatched_bias_before gc.collect() nonbatched_bias = dap.all_gather_opp(nonbatched_bias, axis=2) if not self.training: # low memory mode using subbatch sb_attn = subbatch( self.attention, [0, 1, 2], [1, 1, 1], self.global_config.subbatch_size, 1, same_arg_idx={1: 0}) pair_act = sb_attn(pair_act, pair_act, bias, nonbatched_bias) else: pair_act = self.attention(pair_act, pair_act, bias, nonbatched_bias) if self.config.orientation == 'per_column': pair_act = pair_act.transpose([0, 2, 1, 3]) return pair_act class TriangleMultiplication(nn.Layer): """Triangle multiplication layer ("outgoing" or "incoming"). Jumper et al. (2021) Suppl. Alg. 11 "TriangleMultiplicationOutgoing" Jumper et al. (2021) Suppl. Alg. 
12 "TriangleMultiplicationIncoming" """ def __init__(self, channel_num, config, global_config, name='triangle_multiplication'): super(TriangleMultiplication, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear self.layer_norm_input = nn.LayerNorm( self.channel_num['pair_channel'], name='layer_norm_input') self.left_projection = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='left_projection') self.right_projection = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='right_projection') self.left_gate = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='left_gate') init_gate_linear(self.left_gate) self.right_gate = Linear( self.channel_num['pair_channel'], self.config.num_intermediate_channel, name='right_gate') init_gate_linear(self.right_gate) # line 4 self.center_layer_norm = nn.LayerNorm( self.config.num_intermediate_channel, name='center_layer_norm') self.output_projection = Linear( self.config.num_intermediate_channel, self.channel_num['pair_channel'], name='output_projection') init_final_linear(self.output_projection) # line 3 self.gating_linear = Linear( self.channel_num['pair_channel'], self.channel_num['pair_channel'], name='output_projection') init_gate_linear(self.gating_linear) def forward(self, act, mask): """Builds TriangleMultiplication module. Args: act (float): Pair activations, shape [batch, N_res, N_res, c_z] mask (float): Pair mask, shape [batch, N_res, N_res]. Returns: Outputs, same shape/type as act. """ # Outgoing [batch, N_res//dap_size, N_res] => [batch, N_res//dap_size, N_res, 1] # Incoming [batch, N_res, N_res//dap_size] => [batch, N_res, N_res//dap_size, 1] mask = paddle.unsqueeze(mask, axis=-1) # [batch, N_res, N_res, 1] # Outgoing [B, N_res//dap_size, N_res, c_z] # Incoming [B, N_res, N_res//dap_size, c_z] act = self.layer_norm_input(act) # line 1 # if not self.training: if not self.training and self.global_config.low_memory is True: # Note(GuoxiaWang): using inplace version to save memory(low_mem=True). left_proj_act = self.left_gate(act) left_proj_act.sigmoid_() left_proj_act.multiply_(self.left_projection(act)) left_proj_act.multiply_(mask) right_proj_act_before = self.right_gate(act) right_proj_act_before.sigmoid_() right_proj_act_before.multiply_(self.right_projection(act)) right_proj_act_before.multiply_(mask) else: # Outgoing [B, N_res//dap_size, N_res, c_z] => [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming [B, N_res, N_res//dap_size, c_z] => [B, N_res, N_res//dap_size, num_intermediate_channel] left_proj_act = mask * self.left_projection(act) right_proj_act = mask * self.right_projection(act) # Outgoing [B, N_res//dap_size, N_res, c_z] => [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming [B, N_res, N_res//dap_size, c_z] => [B, N_res, N_res//dap_size, num_intermediate_channel] left_gate_values = nn.functional.sigmoid(self.left_gate(act)) right_gate_values = nn.functional.sigmoid(self.right_gate(act)) # Outgoing [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming [B, N_res, N_res//dap_size, num_intermediate_channel] left_proj_act = left_proj_act * left_gate_values right_proj_act_before = right_proj_act * right_gate_values # "Outgoing" edges equation: 'ikc,jkc->ijc' # "Incoming" edges equation: 'kjc,kic->ijc' # Note on the Suppl. Alg. 
11 & 12 notation: # For the "outgoing" edges, a = left_proj_act and b = right_proj_act # For the "incoming" edges, it's swapped: # b = left_proj_act and a = right_proj_act if self.config.equation == 'ikc,jkc->ijc': # Outgoing # [B, N_res//dap_size, N_res, num_intermediate_channel] => [B, N_res, N_res, num_intermediate_channel] right_proj_act = dap.all_gather(right_proj_act_before, axis=1) # if not self.training: if not self.training and self.global_config.low_memory is True: del right_proj_act_before gc.collect() elif self.config.equation == 'kjc,kic->ijc': # Incoming # [B, N_res, N_res//dap_size, num_intermediate_channel] => [B, N_res, N_res, num_intermediate_channel] right_proj_act = dap.all_gather(right_proj_act_before, axis=2) # if not self.training: if not self.training and self.global_config.low_memory is True: del right_proj_act_before gc.collect() else: raise ValueError('unknown equation.') # Outgoing [B, N_res//dap_size, N_res, c_z] # Incoming [B, N_res, N_res//dap_size, c_z] # if not self.training: if not self.training and self.global_config.low_memory is True: gate_values = self.gating_linear(act).sigmoid_() # line 3 else: gate_values = nn.functional.sigmoid( self.gating_linear(act)) # line 3 if self.config.equation == 'ikc,jkc->ijc': # Outgoing dim, out_idx = 1, 1 equation = 'bikc,bjkc->bijc' # [B, N_res, N_res, num_intermediate_channel] right_proj_act_after = dap.all_gather_opp(right_proj_act, axis=1) elif self.config.equation == 'kjc,kic->ijc': # Incoming dim, out_idx = 2, 2 equation = 'bkjc,bkic->bijc' # [B, N_res, N_res, num_intermediate_channel] right_proj_act_after = dap.all_gather_opp(right_proj_act, axis=2) else: raise ValueError('unknown equation.') if not self.training: einsum_fn = subbatch(paddle.einsum, [1], [dim], self.global_config.subbatch_size, out_idx) act = einsum_fn(equation, left_proj_act, right_proj_act_after) else: # Outgoing equation = 'bikc,bjkc->bijc' # [B, N_res//dap_size, N_res, num_intermediate_channel], [B, N_res, N_res, num_intermediate_channel] # => [B, N_res//dap_size, N_res, num_intermediate_channel] # Incoming equation = 'bkjc,bkic->bijc' # [B, N_res, N_res//dap_size, num_intermediate_channel], [B, N_res, N_res, num_intermediate_channel] # => [B, N_res, N_res//dap_size, num_intermediate_channel] act = paddle.einsum(equation, left_proj_act, right_proj_act_after) act = self.center_layer_norm(act) act = self.output_projection(act) act = act * gate_values return act ================================================ FILE: ppfleetx/models/protein_folding/common.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
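# ---------------------------------------------------------------------------
# Illustrative note (not part of the original file): the subbatch() helper
# defined below wraps a function so that it is applied chunk-by-chunk along a
# chosen input dimension and the partial outputs are concatenated back
# together, trading a little extra compute for a much smaller activation
# footprint at inference time. A minimal usage sketch, assuming a working
# paddle install and a hypothetical element-wise `toy_fn`:
#
#     import paddle
#
#     def toy_fn(x):                 # any function of a [B, L, C] tensor
#         return x * 2.0
#
#     x = paddle.rand([1, 10, 4])
#     chunked = subbatch(toy_fn, arg_idx=[0], dim=[1], bs=3, out_idx=1)
#     y = chunked(x)                 # evaluated in slices of 3 along dim 1
#     # y equals toy_fn(x), but peak memory scales with the slice size.
# ---------------------------------------------------------------------------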
import numpy as np import functools import numbers import collections import paddle import paddle.nn as nn from paddle.distributed.fleet.utils import recompute try: from paddle import _legacy_C_ops as _C_ops except: from paddle import _C_ops def set_tensor_constant(tensor, constant): tensor.set_value(paddle.full_like(tensor, constant)) def init_gate_linear(linear): set_tensor_constant(linear.weight, 0) set_tensor_constant(linear.bias, 1) def init_final_linear(linear): set_tensor_constant(linear.weight, 0) def recompute_wrapper(func, *args, is_recompute=True): """Function wrapper for recompute""" if is_recompute: return recompute(func, *args) else: return func(*args) def subbatch(f, arg_idx, dim, bs, out_idx, same_arg_idx={}): """ Converts a function to one that applies to subbatch of an input dimension. Args: f(Callable): original function. arg_idx([int]): indices of the inputs to be subbatched. dim([int]): index of the dimension to be subbatched. bs(int): subbatch size. out_idx(int): index of the output dimension that needs stacking same_arg_idx(dict), optional: index of same arg mapping. e.g {1: 0} means arg[1] == arg[0], we assign _args[1] = _args[0] avoiding slice repeatly. Returns: converted function. """ @functools.wraps(f) def wrapper(*args, **kwargs): assert len(arg_idx) == len( dim ), f'Number of batching args and number of batching dims should match.' inps = [args[i] for i in arg_idx] dim_width = [inp.shape[d] for inp, d in zip(inps, dim)] assert len(set(dim_width)) == 1, f'Batch sizes should be kept equal.' inp_dim = {inp: d for inp, d in zip(inps, dim)} dim_width = dim_width[0] if dim_width < bs: return f(*args, **kwargs) outs = [] for slice_at in np.arange(0, dim_width, bs): _args = [] for i, inp in enumerate(args): if i in same_arg_idx: assert i > same_arg_idx[ i], f"expect i > same_arg_idx[i], but got i: {i} and same_arg_idx[i]: {same_arg_idx[i]}" _args.append(_args[same_arg_idx[i]]) elif i in arg_idx: inp = inp.slice([inp_dim[inp]], [slice_at], [slice_at + bs]) _args.append(inp) else: _args.append(inp) outs.append(f(*_args, **kwargs)) return paddle.concat(outs, out_idx) return wrapper def batched_gather(params, indices, axis=0, batch_dims=0): # Implement gather with batching, like tensorflow: # https://www.tensorflow.org/api_docs/python/tf/gather#batching # print(params.shape, indices.shape, axis) p, i = params, indices rank = len(p.shape) axis = (rank + axis) % rank # The stride of axis stride = p.shape[batch_dims + axis] if batch_dims == 0 and len(i.shape) == 1: return paddle.gather(p, i, axis=axis) elif batch_dims == 0: flat_i = i.reshape([-1]) gathered = paddle.gather(p, flat_i, axis=axis) shape = p.shape[:axis] + i.shape if axis < rank - 1: shape += params.shape[axis + 1:] return gathered.reshape(shape) b = batch_dims a = axis assert p.shape[:b] == i.shape[:b] bn = np.prod(p.shape[:b]) # Shift batch dimensions right to bundle with axis if a > 0: perm = list(range(rank)) perm = perm[b:(b + a)] + perm[:b] + perm[(b + a):] p = p.transpose(perm) # Merge params' batch+axis p = p.reshape(p.shape[:a] + [-1] + p.shape[(b + a + 1):]) # indices = [Batch..., Index...] 
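    # Note (added for clarity, not in the original source): once the batch
    # dimensions have been bundled into the gather axis above, batch element
    # b owns a contiguous block of `stride` entries along the merged axis.
    # Adding b * stride to every index belonging to batch b therefore turns a
    # per-batch gather into a single flat paddle.gather over the merged axis.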
# Expand the index values across batch elements strides = paddle.arange(bn, dtype="int64").unsqueeze(-1) * stride i = i.reshape([bn, -1]) flat_i = paddle.flatten(i + strides) # Do gather gathered = paddle.gather(p, flat_i, axis=axis) # Unbundle batch and index dimensions unbundled_shape = p.shape[:a] + indices.shape + p.shape[a + 1:] gathered = gathered.reshape(unbundled_shape) # Shift batch dimensions back to the left if a > 0: perm = list(range(len(unbundled_shape))) perm = perm[a:(a + b)] + perm[:a] + perm[(a + b):] gathered = gathered.transpose(perm) return gathered def mask_mean(mask, value, axis=None, drop_mask_channel=False, eps=1e-10): if drop_mask_channel: mask = mask[:, 0] mask_shape = mask.shape value_shape = value.shape assert len(mask_shape) == len(value_shape) if isinstance(axis, numbers.Integral): axis = [axis] elif axis is None: axis = list(range(len(mask_shape))) assert isinstance(axis, collections.abc.Iterable), \ 'axis needs to be either an iterable, integer or "None"' broadcast_factor = 1. for axis_ in axis: value_size = value_shape[axis_] mask_size = mask_shape[axis_] if mask_size == 1: broadcast_factor *= value_size else: assert mask_size == value_size return (paddle.sum(mask * value, axis=axis) / (paddle.sum(mask, axis=axis) * broadcast_factor + eps)) class Transition(nn.Layer): """Transition layer. Jumper et al. (2021) Suppl. Alg. 9 "MSATransition" Jumper et al. (2021) Suppl. Alg. 15 "PairTransition" """ def __init__(self, channel_num, config, global_config, is_extra_msa, transition_type): super(Transition, self).__init__() assert transition_type in ['msa_transition', 'pair_transition'] self.channel_num = channel_num self.config = config self.global_config = global_config self.is_extra_msa = is_extra_msa self.transition_type = transition_type Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear if transition_type == 'msa_transition' and is_extra_msa: in_dim = channel_num['extra_msa_channel'] elif transition_type == 'msa_transition' and not is_extra_msa: in_dim = channel_num['msa_channel'] elif transition_type == 'pair_transition': in_dim = channel_num['pair_channel'] self.input_layer_norm = nn.LayerNorm(in_dim) self.transition1 = Linear( in_dim, int(in_dim * self.config.num_intermediate_factor), weight_attr=paddle.ParamAttr( initializer=nn.initializer.KaimingNormal())) if self.global_config.zero_init: last_init = nn.initializer.Constant(0.0) else: last_init = nn.initializer.TruncatedNormal() self.transition2 = Linear( int(in_dim * self.config.num_intermediate_factor), in_dim, weight_attr=paddle.ParamAttr(initializer=last_init)) def forward(self, act, mask): act = self.input_layer_norm(act) def transition_module(x): x = self.transition1(x) x = nn.functional.relu(x) x = self.transition2(x) return x if not self.training: # low memory mode using subbatch sb_transition = subbatch(transition_module, [0], [1], self.global_config.subbatch_size, 1) act = sb_transition(act) else: act = transition_module(act) return act class Dropout(nn.Layer): def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): super(Dropout, self).__init__() if not isinstance(p, (float, int)): raise TypeError("p argument should be a number") if p < 0 or p > 1: raise ValueError("p argument should between 0 and 1") mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" 
) if axis and not isinstance(axis, (int, list, tuple)): raise TypeError("datatype of axis argument should be int or list") self.p = p self.axis = axis self.mode = mode self.name = name def forward(self, input): # fast return for p == 0 if self.p == 0: return input if self.axis == None: out = nn.functional.dropout( input, p=self.p, axis=self.axis, training=self.training, mode=self.mode, name=self.name) else: seed = None drop_axes = [self.axis] if isinstance(self.axis, int) else list(self.axis) if paddle.static.default_main_program().random_seed != 0: seed = paddle.static.default_main_program().random_seed out, mask = _C_ops.dropout_nd( input, 'dropout_prob', self.p, 'is_test', not self.training, 'fix_seed', seed is not None, 'seed', seed if seed is not None else 0, 'dropout_implementation', self.mode, 'axis', drop_axes) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'p={}, axis={}, mode={}{}'.format(self.p, self.axis, self.mode, name_str) def dgram_from_positions(positions, num_bins, min_bin, max_bin): lower_breaks = paddle.linspace(min_bin, max_bin, num_bins) lower_breaks = paddle.square(lower_breaks) upper_breaks = paddle.concat([ lower_breaks[1:], paddle.full( shape=[1], fill_value=1e8, dtype='float32') ]) def _squared_difference(x, y): return paddle.square(x - y) dist2 = paddle.sum(_squared_difference( paddle.unsqueeze( positions, axis=-2), paddle.unsqueeze( positions, axis=-3)), axis=-1, keepdim=True) dgram = ((dist2 > lower_breaks.astype(dist2.dtype)).astype('float32') * (dist2 < upper_breaks.astype(dist2.dtype)).astype('float32')) return dgram ================================================ FILE: ppfleetx/models/protein_folding/evoformer.py ================================================ """evoformer.py.""" # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import gc import paddle import paddle.nn as nn from ppfleetx.distributed.protein_folding import bp, dap from .attentions import ( MSARowAttentionWithPairBias, MSAColumnGlobalAttention, MSAColumnAttention, TriangleMultiplication, TriangleAttention, ) from .common import ( Transition, Dropout, recompute_wrapper, dgram_from_positions, ) from .template import (TemplateEmbedding, ) from .outer_product_mean import (OuterProductMean, ) from . import ( residue_constants, all_atom, ) class EvoformerIteration(nn.Layer): """Single iteration (block) of Evoformer stack. Jumper et al. (2021) Suppl. Alg. 
6 "EvoformerStack" lines 2-10 """ def __init__(self, channel_num, config, global_config, is_extra_msa=False): super(EvoformerIteration, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config self.is_extra_msa = is_extra_msa assert self.global_config.outer_product_mean_position in [ 'origin', 'middle', 'first', 'end' ] # Row-wise Gated Self-attention with Pair Bias self.msa_row_attention_with_pair_bias = MSARowAttentionWithPairBias( channel_num, self.config.msa_row_attention_with_pair_bias, self.global_config, is_extra_msa) dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_row_attention_with_pair_bias) self.msa_row_attn_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) if self.is_extra_msa: self.msa_column_global_attention = MSAColumnGlobalAttention( channel_num, config.msa_column_attention, global_config) dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_column_global_attention) self.msa_col_attn_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) else: self.msa_column_attention = MSAColumnAttention( channel_num, config.msa_column_attention, global_config) dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_column_attention) self.msa_col_attn_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.msa_transition = Transition( channel_num, self.config.msa_transition, self.global_config, is_extra_msa, 'msa_transition') dropout_rate, dropout_axis = self._parse_dropout_params( self.msa_transition) self.msa_transition_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # OuterProductMean self.outer_product_mean = OuterProductMean( channel_num, self.config.outer_product_mean, self.global_config, self.is_extra_msa, name='outer_product_mean') # Dropout dropout_rate, dropout_axis = self._parse_dropout_params( self.outer_product_mean) self.outer_product_mean_dropout = nn.Dropout( dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # Triangle Multiplication. self.triangle_multiplication_outgoing = TriangleMultiplication( channel_num, self.config.triangle_multiplication_outgoing, self.global_config, name='triangle_multiplication_outgoing') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_multiplication_outgoing) self.triangle_outgoing_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_multiplication_incoming = TriangleMultiplication( channel_num, self.config.triangle_multiplication_incoming, self.global_config, name='triangle_multiplication_incoming') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_multiplication_incoming) self.triangle_incoming_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # TriangleAttention. 
self.triangle_attention_starting_node = TriangleAttention( channel_num, self.config.triangle_attention_starting_node, self.global_config, name='triangle_attention_starting_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_starting_node) self.triangle_starting_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_attention_ending_node = TriangleAttention( channel_num, self.config.triangle_attention_ending_node, self.global_config, name='triangle_attention_ending_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_ending_node) self.triangle_ending_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) # Pair transition. self.pair_transition = Transition( channel_num, self.config.pair_transition, self.global_config, is_extra_msa, 'pair_transition') dropout_rate, dropout_axis = self._parse_dropout_params( self.pair_transition) self.pair_transition_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) def _parse_dropout_params(self, module): """tbd.""" dropout_rate = 0.0 if self.global_config.deterministic else \ module.config.dropout_rate dropout_axis = None if module.config.shared_dropout: dropout_axis = { 'per_row': [0, 2, 3], 'per_column': [0, 1, 3], }[module.config.orientation] return dropout_rate, dropout_axis def outer_product_mean_origin(self, msa_act, pair_act, masks): """tbd.""" assert bp.get_world_size( ) == 1, "Branch Parallel degree must be 1 for outer_product_mean_origin" msa_mask, pair_mask = masks['msa'], masks['pair'] # [B, N_seq//dap_size, N_res, c_m] residual = self.msa_row_attention_with_pair_bias(msa_act, msa_mask, pair_act) residual = self.msa_row_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m] msa_act = dap.row_to_col(msa_act) if self.is_extra_msa: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_global_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual else: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual # [B, N_res//dap_size, N_res, c_z] residual = self.outer_product_mean(msa_act, msa_mask) outer_product_mean = self.outer_product_mean_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(outer_product_mean) del outer_product_mean gc.collect() else: pair_act = pair_act + outer_product_mean # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_act = dap.col_to_row(msa_act) # scatter if using dap, otherwise do nothing pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) # [B, N_res//dap_size, N_res, c_z] # TODO(GuoxiaWang): why have diffrence whether remove pair_act = pair_act.clone() # pair_act = 
pair_act.clone() residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: pair_act.add_(residual) del residual gc.collect() else: pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) return msa_act, pair_act def outer_product_mean_first(self, msa_act, pair_act, masks): """tbd.""" raise NotImplementedError( "BP or DAP does not support outer_product_mean_first") def outer_product_mean_end(self, msa_act, pair_act, masks): """tbd.""" msa_mask, pair_mask = masks['msa'], masks['pair'] if bp.get_world_size() > 1: # Note(GuoxiaWang): add zeros trigger the status of stop_gradient=False within recompute context. 
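            # Note (added for clarity, not in the original source): adding a
            # zero tensor below is numerically a no-op, but it produces a new
            # tensor whose stop_gradient is False inside the recompute scope,
            # and the grad hook registered right after it all-reduces
            # pair_act's gradient across the branch-parallel group, since the
            # MSA branch and the pair branch each contribute a partial grad.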
pair_act = pair_act + paddle.zeros_like(pair_act) # Note(GuoxiaWang): reduce the pair_act's gradient from msa branch and pair branch if not pair_act.stop_gradient: pair_act._register_grad_hook(bp.all_reduce) if bp.get_rank_in_group() == 0: # [B, N_seq//dap_size, N_res, c_m] residual = self.msa_row_attention_with_pair_bias( msa_act, msa_mask, pair_act) residual = self.msa_row_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m] msa_act = dap.row_to_col(msa_act) if self.is_extra_msa: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_global_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual else: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual # [B, N_res//dap_size, N_res, c_z] residual = self.outer_product_mean(msa_act, msa_mask) outer_product_mean = self.outer_product_mean_dropout(residual) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_act = dap.col_to_row(msa_act) if bp.get_rank_in_group() == 1: # scatter if using dap, otherwise do nothing pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) outer_product_mean = paddle.zeros_like(pair_act) outer_product_mean.stop_gradient = pair_act.stop_gradient # TODO(GuoxiaWang): fix PyLayer ctx illegal access msa_act = paddle.assign(msa_act) pair_act = paddle.assign(pair_act) msa_act, pair_act = bp.sync_evoformer_results(outer_product_mean, msa_act, pair_act) # TODO(GuoxiaWang): fix PyLayer ctx illegal access pair_act = paddle.assign(pair_act) return msa_act, pair_act else: # [B, N_seq//dap_size, N_res, c_m] residual = self.msa_row_attention_with_pair_bias(msa_act, msa_mask, pair_act) residual = 
self.msa_row_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res//dap_size, c_m] msa_act = dap.row_to_col(msa_act) if self.is_extra_msa: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_global_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual else: # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_column_attention(msa_act, msa_mask) residual = self.msa_col_attn_dropout(residual) msa_act = msa_act + residual # [B, N_seq, N_res//dap_size, c_m] residual = self.msa_transition(msa_act, msa_mask) residual = self.msa_transition_dropout(residual) msa_act = msa_act + residual # [B, N_res//dap_size, N_res, c_z] residual = self.outer_product_mean(msa_act, msa_mask) outer_product_mean = self.outer_product_mean_dropout(residual) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_act = dap.col_to_row(msa_act) # scatter if using dap, otherwise do nothing pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) # [B, N_res//dap_size, N_res, c_z] # TODO(GuoxiaWang): why have diffrence whether remove pair_act = pair_act.clone() # pair_act = pair_act.clone() residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) # [B, N_res//dap_size, N_res, c_z] residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) pair_act = pair_act + residual # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res//dap_size, c_z] pair_act = dap.row_to_col(pair_act) # [B, N_res, N_res//dap_size, c_z] residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) pair_act = pair_act + residual # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] pair_act = dap.col_to_row(pair_act) pair_act = pair_act + outer_product_mean return msa_act, pair_act def forward(self, msa_act, pair_act, masks): """tbd.""" if self.global_config.outer_product_mean_position in [ 'origin', 'middle' ]: msa_act, pair_act = self.outer_product_mean_origin(msa_act, pair_act, masks) elif self.global_config.outer_product_mean_position == 'first': msa_act, pair_act = self.outer_product_mean_first(msa_act, pair_act, masks) elif self.global_config.outer_product_mean_position == 'end': msa_act, pair_act = self.outer_product_mean_end(msa_act, pair_act, masks) else: raise Error( "Only support outer_product_mean_position in ['origin', 'middle', ''first', 'end'] now!" ) return msa_act, pair_act class DistEmbeddingsAndEvoformer(nn.Layer): """Embeds the input data and runs Evoformer. Produces the MSA, single and pair representations. Jumper et al. (2021) Suppl. 
Alg. 2 "Inference" line 5-18 """ def __init__(self, channel_num, config, global_config): super(DistEmbeddingsAndEvoformer, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear # InputEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5 # Jumper et al. (2021) Suppl. Alg. 3 "InputEmbedder" self.preprocess_1d = Linear( channel_num['target_feat'], self.config.msa_channel, name='preprocess_1d') self.preprocess_msa = Linear( channel_num['msa_feat'], self.config.msa_channel, name='preprocess_msa') self.left_single = Linear( channel_num['target_feat'], self.config.pair_channel, name='left_single') self.right_single = Linear( channel_num['target_feat'], self.config.pair_channel, name='right_single') # RecyclingEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 6 # Jumper et al. (2021) Suppl. Alg. 32 "RecyclingEmbedder" if self.config.recycle_pos: self.prev_pos_linear = Linear(self.config.prev_pos.num_bins, self.config.pair_channel) # RelPosEmbedder # Jumper et al. (2021) Suppl. Alg. 4 "relpos" # Jumper et al. (2021) Suppl. Alg. 5 "one_hot" if self.config.max_relative_feature: self.pair_activiations = Linear( 2 * self.config.max_relative_feature + 1, self.config.pair_channel) if self.config.recycle_features: self.prev_msa_first_row_norm = nn.LayerNorm( self.config.msa_channel) self.prev_pair_norm = nn.LayerNorm(self.config.pair_channel) # Embed templates into the pair activations. # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-13 if self.config.template.enabled: self.channel_num['template_angle'] = 57 self.channel_num['template_pair'] = 88 self.template_embedding = TemplateEmbedding( self.channel_num, self.config.template, self.global_config) # ExtraMSAEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 14-16 self.extra_msa_activations = Linear( 25, # 23 (20aa+unknown+gap+mask) + 1 (has_del) + 1 (del_val) self.config.extra_msa_channel) # Extra MSA Stack. # Jumper et al. (2021) Suppl. Alg. 18 "ExtraMsaStack" self.extra_msa_stack = nn.LayerList() for _ in range(self.config.extra_msa_stack_num_block): self.extra_msa_stack.append( EvoformerIteration( self.channel_num, self.config.evoformer, self.global_config, is_extra_msa=True)) # Embed templates torsion angles if self.config.template.enabled and self.config.template.embed_torsion_angles: c = self.config.msa_channel self.template_single_embedding = Linear( self.channel_num['template_angle'], c) self.template_projection = Linear(c, c) # Main trunk of the network # Jumper et al. (2021) Suppl. Alg. 
2 "Inference" lines 17-18 self.evoformer_iteration = nn.LayerList() for _ in range(self.config.evoformer_num_block): self.evoformer_iteration.append( EvoformerIteration( self.channel_num, self.config.evoformer, self.global_config, is_extra_msa=False)) self.single_activations = Linear(self.config.msa_channel, self.config.seq_channel) def _pseudo_beta_fn(self, aatype, all_atom_positions, all_atom_masks): """tbd.""" gly_id = paddle.ones_like(aatype) * residue_constants.restype_order[ 'G'] is_gly = paddle.equal(aatype, gly_id) ca_idx = residue_constants.atom_order['CA'] cb_idx = residue_constants.atom_order['CB'] n = len(all_atom_positions.shape) pseudo_beta = paddle.where( paddle.tile( paddle.unsqueeze( is_gly, axis=-1), [1] * len(is_gly.shape) + [3]), paddle.squeeze( all_atom_positions.slice([n - 2], [ca_idx], [ca_idx + 1]), axis=-2), paddle.squeeze( all_atom_positions.slice([n - 2], [cb_idx], [cb_idx + 1]), axis=-2)) if all_atom_masks is not None: m = len(all_atom_masks) pseudo_beta_mask = paddle.where( is_gly, paddle.squeeze( all_atom_masks.slice([m - 1], [ca_idx], [ca_idx + 1]), axis=-1), paddle.squeeze( all_atom_masks.slice([m - 1], [cb_idx], [cb_idx + 1]), axis=-1)) pseudo_beta_mask = paddle.squeeze(pseudo_beta_mask, axis=-1) return pseudo_beta, pseudo_beta_mask else: return pseudo_beta def _create_extra_msa_feature(self, batch): """tbd.""" # 23: 20aa + unknown + gap + bert mask msa_1hot = nn.functional.one_hot(batch['extra_msa'], 23) msa_feat = [ msa_1hot, paddle.unsqueeze( batch['extra_has_deletion'], axis=-1), paddle.unsqueeze( batch['extra_deletion_value'], axis=-1) ] return paddle.concat(msa_feat, axis=-1) def forward(self, batch): """tbd.""" # InputEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5 # Jumper et al. (2021) Suppl. Alg. 3 "InputEmbedder" preprocess_1d = self.preprocess_1d(batch['target_feat']) # preprocess_msa = self.preprocess_msa(batch['msa_feat']) msa_activations = paddle.unsqueeze(preprocess_1d, axis=1) + \ self.preprocess_msa(batch['msa_feat']) right_single = self.right_single( batch['target_feat']) # 1, n_res, 22 -> 1, n_res, 128 right_single = paddle.unsqueeze( right_single, axis=1) # 1, n_res, 128 -> 1, 1, n_res, 128 left_single = self.left_single( batch['target_feat']) # 1, n_res, 22 -> 1, n_res, 128 left_single = paddle.unsqueeze( left_single, axis=2) # 1, n_res, 128 -> 1, n_res, 1, 128 pair_activations = left_single + right_single if not self.training and self.global_config.low_memory is True: del left_single del right_single gc.collect() # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z] pair_activations = dap.scatter(pair_activations, axis=1) mask_2d = paddle.unsqueeze( batch['seq_mask'], axis=1) * paddle.unsqueeze( batch['seq_mask'], axis=2) # Inject previous outputs for recycling. # Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 6 # Jumper et al. (2021) Suppl. Alg. 
32 "RecyclingEmbedder" if self.config.recycle_pos and 'prev_pos' in batch: prev_pseudo_beta = self._pseudo_beta_fn(batch['aatype'], batch['prev_pos'], None) dgram = dgram_from_positions(prev_pseudo_beta, **self.config.prev_pos) if not self.training and self.global_config.low_memory is True: dgram = dap.scatter(dgram, axis=1) pair_activations += self.prev_pos_linear(dgram) del dgram del prev_pseudo_beta gc.collect() else: pair_activations += self.prev_pos_linear(dgram) if self.config.recycle_features: if 'prev_msa_first_row' in batch: prev_msa_first_row = self.prev_msa_first_row_norm(batch[ 'prev_msa_first_row']) # A workaround for `jax.ops.index_add` msa_first_row = paddle.squeeze( msa_activations[:, 0, :], axis=1) msa_first_row += prev_msa_first_row msa_first_row = paddle.unsqueeze(msa_first_row, axis=1) msa_activations = paddle.concat( [msa_first_row, msa_activations[:, 1:, :]], axis=1) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: del prev_msa_first_row del msa_first_row gc.collect() if 'prev_pair' in batch: # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: prev_pair = batch['prev_pair'] prev_pair_gpu = prev_pair.cuda() prev_pair_gpu = dap.scatter(prev_pair_gpu, axis=1) pair_activations += self.prev_pair_norm(prev_pair_gpu) del prev_pair_gpu gc.collect() else: pair_activations += self.prev_pair_norm(batch['prev_pair']) # RelPosEmbedder # Jumper et al. (2021) Suppl. Alg. 4 "relpos" # Jumper et al. (2021) Suppl. Alg. 5 "one_hot" if self.config.max_relative_feature: pos = batch['residue_index'] # [bs, N_res] offset = paddle.unsqueeze(pos, axis=[-1]) - \ paddle.unsqueeze(pos, axis=[-2]) rel_pos = nn.functional.one_hot( paddle.clip( offset + self.config.max_relative_feature, min=0, max=2 * self.config.max_relative_feature), 2 * self.config.max_relative_feature + 1) if not self.training and self.global_config.low_memory is True: rel_pos = dap.scatter(rel_pos, axis=1) rel_pos_bias = self.pair_activiations(rel_pos) pair_activations += rel_pos_bias del rel_pos del rel_pos_bias gc.collect() else: rel_pos_bias = self.pair_activiations(rel_pos) pair_activations += rel_pos_bias # TemplateEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-13 if self.config.template.enabled: template_batch = { k: batch[k] for k in batch if k.startswith('template_') } template_pair_repr = self.template_embedding( pair_activations, template_batch, mask_2d) pair_activations += template_pair_repr # ExtraMSAEmbedder # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 14-16 extra_msa_feat = self._create_extra_msa_feature(batch) extra_msa_activations = self.extra_msa_activations(extra_msa_feat) # if not self.training: # for inference if not self.training and self.global_config.low_memory is True: del extra_msa_feat gc.collect() # ================================================== # Extra MSA Stack # Jumper et al. (2021) Suppl. Alg. 
18 "ExtraMsaStack" # ================================================== if not self.training and self.global_config.low_memory is True: # scatter if using dap, otherwise do nothing # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] extra_msa_activations = dap.scatter(extra_msa_activations, axis=1) # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] msa_activations = dap.scatter(msa_activations, axis=1) extra_msa_stack_input = { 'msa': extra_msa_activations, 'pair': pair_activations, } if not self.training and self.global_config.low_memory is True: del pair_activations gc.collect() if bp.get_world_size() > 1: extra_msa_stack_input['msa'] = bp.broadcast_grad_for_backward( extra_msa_stack_input['msa'], 0) if not self.training and self.global_config.low_memory is True: pass else: # scatter if using dap, otherwise do nothing # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] extra_msa_stack_input['msa'] = dap.scatter( extra_msa_stack_input['msa'], axis=1) # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z] extra_msa_stack_input['pair'] = dap.scatter( extra_msa_stack_input['pair'], axis=1) for idx, extra_msa_stack_iteration in enumerate(self.extra_msa_stack): extra_msa_act, extra_pair_act = recompute_wrapper( extra_msa_stack_iteration, extra_msa_stack_input['msa'], extra_msa_stack_input['pair'], {'msa': batch['extra_msa_mask'], 'pair': mask_2d}, is_recompute=self.training and idx >= self.config.extra_msa_stack_recompute_start_block_index) extra_msa_stack_output = { 'msa': extra_msa_act, 'pair': extra_pair_act } extra_msa_stack_input = { 'msa': extra_msa_stack_output['msa'], 'pair': extra_msa_stack_output['pair'] } if not self.training and self.global_config.low_memory is True: pass else: # gather if using dap, otherwise do nothing # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res, c_z] extra_msa_stack_output['pair'] = dap.gather( extra_msa_stack_output['pair'], axis=1) evoformer_input = { 'msa': msa_activations, 'pair': extra_msa_stack_output['pair'], } evoformer_masks = { 'msa': batch['msa_mask'], 'pair': mask_2d, } if not self.training and self.global_config.low_memory is True: del extra_msa_stack_input del extra_msa_stack_output gc.collect() # ================================================== # Template angle feat # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 7-8 # ================================================== if self.config.template.enabled and self.config.template.embed_torsion_angles: num_templ, num_res = batch['template_aatype'].shape[1:] aatype_one_hot = nn.functional.one_hot(batch['template_aatype'], 22) # Embed the templates aatype, torsion angles and masks. # Shape (templates, residues, msa_channels) ret = all_atom.atom37_to_torsion_angles( aatype=batch['template_aatype'], all_atom_pos=batch['template_all_atom_positions'], all_atom_mask=batch['template_all_atom_masks'], # Ensure consistent behaviour during testing: placeholder_for_undefined=not self.global_config.zero_init) template_features = paddle.concat( [ aatype_one_hot, paddle.reshape(ret['torsion_angles_sin_cos'], [-1, num_templ, num_res, 14]), paddle.reshape(ret['alt_torsion_angles_sin_cos'], [-1, num_templ, num_res, 14]), ret['torsion_angles_mask'] ], axis=-1) template_activations = self.template_single_embedding( template_features) template_activations = nn.functional.relu(template_activations) template_activations = self.template_projection( template_activations) # Concatenate the templates to the msa. 
evoformer_input['msa'] = paddle.concat( [evoformer_input['msa'], template_activations], axis=1) # Concatenate templates masks to the msa masks. # Use mask from the psi angle, as it only depends on the backbone atoms # from a single residue. torsion_angle_mask = ret['torsion_angles_mask'][..., 2] torsion_angle_mask = torsion_angle_mask.astype(evoformer_masks[ 'msa'].dtype) evoformer_masks['msa'] = paddle.concat( [evoformer_masks['msa'], torsion_angle_mask], axis=1) if bp.get_world_size() > 1: evoformer_input['msa'] = bp.broadcast_grad_for_backward( evoformer_input['msa'], 0) # if self.training: if not self.training and self.global_config.low_memory is True: pass else: # scatter if using dap, otherwise do nothing # [B, N_seq, N_res, c_m] => [B, N_seq//dap_size, N_res, c_m] evoformer_input['msa'] = dap.scatter( evoformer_input['msa'], axis=1) # [B, N_res, N_res, c_z] => [B, N_res//dap_size, N_res, c_z] evoformer_input['pair'] = dap.scatter( evoformer_input['pair'], axis=1) # ================================================== # Main MSA Stack # Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 17-18 # ================================================== for idx, evoformer_block in enumerate(self.evoformer_iteration): msa_act, pair_act = recompute_wrapper( evoformer_block, evoformer_input['msa'], evoformer_input['pair'], evoformer_masks, is_recompute=self.training and idx >= self.config.evoformer_recompute_start_block_index) evoformer_output = {'msa': msa_act, 'pair': pair_act} evoformer_input = { 'msa': evoformer_output['msa'], 'pair': evoformer_output['pair'], } # gather if using dap, otherwise do nothing # [B, N_seq//dap_size, N_res, c_m] => [B, N_seq, N_res, c_m] evoformer_output['msa'] = dap.gather(evoformer_output['msa'], axis=1) # [B, N_res//dap_size, N_res, c_z] => [B, N_res, N_res, c_z] evoformer_output['pair'] = dap.gather(evoformer_output['pair'], axis=1) msa_activations = evoformer_output['msa'] pair_activations = evoformer_output['pair'] if not self.training and self.global_config.low_memory is True: pair_activations_cpu = pair_activations.cpu() del pair_activations single_activations = self.single_activations(msa_activations[:, 0]) # if not self.training and self.global_config.low_memory is True: # pair_act_out = pair_activations_cpu # else: # pair_act_out = pair_activations num_seq = batch['msa_feat'].shape[1] output = { 'single': single_activations, 'pair': pair_activations_cpu if not self.training and self.global_config.low_memory is True else pair_activations, # Crop away template rows such that they are not used # in MaskedMsaHead. 'msa': msa_activations[:, :num_seq], 'msa_first_row': msa_activations[:, 0], } return output ================================================ FILE: ppfleetx/models/protein_folding/outer_product_mean.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
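# ---------------------------------------------------------------------------
# Illustrative note (not part of the original file): ignoring the DAP
# sharding and the chunked/fused matmul paths used below, OuterProductMean
# reduces to the reference computation of Suppl. Alg. 10 -- a masked outer
# product of two per-residue projections, averaged over the sequence axis.
# A readability sketch (the einsum spelling here is an assumption for
# exposition, not the code path this module actually executes):
#
#     left  = mask * left_projection(layer_norm_input(act))   # [B, S, I, C]
#     right = mask * right_projection(layer_norm_input(act))  # [B, S, J, C]
#     outer = paddle.einsum('nsic,nsjd->nijcd', left, right)
#     norm  = paddle.einsum('nsix,nsjx->nijx', mask, mask) + 1e-3
#     pair_update = (paddle.einsum('nijcd,cdz->nijz', outer, output_w)
#                    + output_b) / norm
# ---------------------------------------------------------------------------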
import paddle import paddle.nn as nn from ppfleetx.distributed.protein_folding import dap from .common import subbatch class OuterProductMean(nn.Layer): """Computes mean outer product. Jumper et al. (2021) Suppl. Alg. 10 "OuterProductMean" """ def __init__(self, channel_num, config, global_config, is_extra_msa, name='outer_product_mean'): super(OuterProductMean, self).__init__() self.channel_num = channel_num self.config = config self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear if is_extra_msa: c_m = channel_num['extra_msa_channel'] else: c_m = channel_num['msa_channel'] self.layer_norm_input = nn.LayerNorm(c_m, name='layer_norm_input') self.left_projection = Linear( c_m, self.config.num_outer_channel, name='left_projection') self.right_projection = Linear( c_m, self.config.num_outer_channel, name='right_projection') if self.global_config.zero_init: init_w = nn.initializer.Constant(value=0.0) else: init_w = nn.initializer.KaimingNormal() self.output_w = paddle.create_parameter( [ self.config.num_outer_channel, self.config.num_outer_channel, channel_num['pair_channel'] ], 'float32', default_initializer=init_w) self.output_b = paddle.create_parameter( [channel_num['pair_channel']], 'float32', default_initializer=nn.initializer.Constant(value=0.0)) def forward(self, act, mask): """Builds OuterProductMean module. Arguments: act: MSA representation, shape [batch, N_seq, N_res, c_m]. mask: MSA mask, shape [batch, N_seq, N_res]. Returns: Update to pair representation, shape [batch, N_res, N_res, c_z]. """ # [B, N_seq, N_res//dap_size, c_m] act = self.layer_norm_input(act) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq, N_res//dap_size, num_outer_channel] right_act_before = self.right_projection(act) # [B, N_seq, N_res//dap_size, num_outer_channel] => [B, N_seq, N_res, num_outer_channel] right_act = dap.all_gather(right_act_before, axis=2) # [B, N_seq, N_res//dap_size, c_m] => [B, N_seq, N_res//dap_size, num_outer_channel] left_act = self.left_projection(act) # [B, N_seq, N_res] => [B, N_seq, N_res, 1] mask = paddle.unsqueeze(mask, axis=-1) # [B, N_seq, N_res, 1] => [B, N_seq, N_res//dap_size, 1] mask_col = dap.scatter(mask, axis=2) left_act = mask_col * left_act # [B, N_seq, N_res//dap_size, 1], [B, N_seq, N_res, 1] => [B, N_res//dap_size, N_res, 1] epsilon = 1e-3 norm = paddle.einsum('nabc,nadc->nbdc', mask_col, mask) + epsilon def fast_einsum(equation, left_act, right_act): assert equation == "nacb,nade->ndceb" tmp = paddle.matmul( x=paddle.reshape( right_act, [right_act.shape[0], right_act.shape[1], -1]), # na(de) y=paddle.reshape( left_act, [left_act.shape[0], left_act.shape[1], -1]), # na(cb) transpose_x=True, transpose_y=False) # n(de)(cb) tmp = paddle.reshape(tmp, [ left_act.shape[0], right_act.shape[2], right_act.shape[3], left_act.shape[2], left_act.shape[3] ]) out = paddle.transpose(tmp, perm=[0, 1, 3, 2, 4]) return out def compute_chunk(left_act, right_act): # This is equivalent to # # act = jnp.einsum('abc,ade->dceb', left_act, right_act) # act = jnp.einsum('dceb,cef->bdf', act, output_w) + output_b # # but faster. maybe for subbatch inference? 
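            # Note (added for clarity, not in the original source): the
            # fast_einsum helper defined above realises 'nacb,nade->ndceb' as
            # one batched matmul that contracts the shared sequence axis 'a'
            # (by flattening the trailing dims of each operand), followed by a
            # reshape and transpose back to the einsum's output layout; on GPU
            # this is usually faster than a general einsum.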
# [B, N_seq, N_res//dap_size, num_outer_channel] => [B, N_seq, num_outer_channel, N_res//dap_size] left_act = left_act.transpose([0, 1, 3, 2]) # wait if using async communication and dap, otherwise do nothing right_act_after = dap.all_gather_opp(right_act, axis=2) # [B, N_seq, num_outer_channel, N_res//dap_size], [B, N_seq, N_res, num_outer_channel] # => [B, N_res, num_outer_channel, num_outer_channel, N_res//dap_size] act = fast_einsum('nacb,nade->ndceb', left_act, right_act_after) # [B, N_res, num_outer_channel, num_outer_channel, N_res//dap_size], [num_outer_channel, num_outer_channel, c_z] # => [B, N_res, N_res//dap_size, c_z] act = paddle.einsum('ndceb,cef->ndbf', act, self.output_w) + self.output_b # [B, N_res, N_res//dap_size, c_z] => [B, N_res//dap_size, N_res, c_z] return act.transpose([0, 2, 1, 3]) if not self.training: # low memory mode using subbatch sb_chunk = subbatch(compute_chunk, [0], [2], self.config.chunk_size, 1) act = sb_chunk(left_act, right_act) else: act = compute_chunk(left_act, right_act) act = act / norm return act ================================================ FILE: ppfleetx/models/protein_folding/quat_affine.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Quaternion geometry modules. This introduces a representation of coordinate frames that is based around a ‘QuatAffine’ object. This object describes an array of coordinate frames. It consists of vectors corresponding to the origin of the frames as well as orientations which are stored in two ways, as unit quaternions as well as a rotation matrices. The rotation matrices are derived from the unit quaternions and the two are kept in sync. For an explanation of the relation between unit quaternions and rotations see https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation This representation is used in the model for the backbone frames. One important thing to note here, is that while we update both representations the jit compiler is going to ensure that only the parts that are actually used are executed. 
""" import paddle import functools import numpy as np from typing import Tuple QUAT_TO_ROT = np.zeros((4, 4, 3, 3), dtype=np.float32) QUAT_TO_ROT[0, 0] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] # rr QUAT_TO_ROT[1, 1] = [[1, 0, 0], [0, -1, 0], [0, 0, -1]] # ii QUAT_TO_ROT[2, 2] = [[-1, 0, 0], [0, 1, 0], [0, 0, -1]] # jj QUAT_TO_ROT[3, 3] = [[-1, 0, 0], [0, -1, 0], [0, 0, 1]] # kk QUAT_TO_ROT[1, 2] = [[0, 2, 0], [2, 0, 0], [0, 0, 0]] # ij QUAT_TO_ROT[1, 3] = [[0, 0, 2], [0, 0, 0], [2, 0, 0]] # ik QUAT_TO_ROT[2, 3] = [[0, 0, 0], [0, 0, 2], [0, 2, 0]] # jk QUAT_TO_ROT[0, 1] = [[0, 0, 0], [0, 0, -2], [0, 2, 0]] # ir QUAT_TO_ROT[0, 2] = [[0, 0, 2], [0, 0, 0], [-2, 0, 0]] # jr QUAT_TO_ROT[0, 3] = [[0, -2, 0], [2, 0, 0], [0, 0, 0]] # kr QUAT_MULTIPLY = np.zeros((4, 4, 4), dtype=np.float32) QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]] QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]] QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]] QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]] QUAT_MULTIPLY_BY_VEC = QUAT_MULTIPLY[:, 1:, :] def rot_to_quat(rot): """Convert rotation matrix to quaternion. Note that this function calls self_adjoint_eig which is extremely expensive on the GPU. If at all possible, this function should run on the CPU. Args: rot: rotation matrix (see below for format). rotation matrix should be shape (..., 3, 3) Returns: Quaternion as (..., 4) tensor. """ rot = [[rot[..., i, j] for j in range(3)] for i in range(3)] [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = rot # pylint: disable=bad-whitespace k = [[ xx + yy + zz, zy - yz, xz - zx, yx - xy, ], [ zy - yz, xx - yy - zz, xy + yx, xz + zx, ], [ xz - zx, xy + yx, yy - xx - zz, yz + zy, ], [ yx - xy, xz + zx, yz + zy, zz - xx - yy, ]] k = (1. / 3.) * paddle.stack( [paddle.stack( x, axis=-1) for x in k], axis=-2) # Get eigenvalues in non-decreasing order and associated. _, qs = paddle.linalg.eigh(k) return qs[..., -1] def quat_to_rot(normalized_quat): """Convert a normalized quaternion to a rotation matrix. Quat (..., 4)""" mat = paddle.unsqueeze(normalized_quat, [-1, -3]) # normalized_quat[..., None, :, None] rot_tensor = paddle.sum( paddle.to_tensor(np.reshape(QUAT_TO_ROT, (4, 4, 9))) * normalized_quat[..., :, None, None] * mat, axis=(-3, -2)) # (..., 4, 4, 9) -> (..., 9) t_shape = rot_tensor.shape[:-1] t_shape.extend([3, 3]) rot = paddle.reshape(rot_tensor, t_shape) # Unstack. (..., 3, 3) return rot def quat_multiply_by_vec(quat, vec): """Multiply a quaternion by a pure-vector quaternion.""" mat = paddle.unsqueeze(vec, [-1, -3]) # vec[..., None, :, None] return paddle.sum(paddle.to_tensor(QUAT_MULTIPLY_BY_VEC) * quat[..., :, None, None] * mat, axis=(-3, -2)) def quat_multiply(quat1, quat2): """Multiply a quaternion by another quaternion.""" mat = paddle.unsqueeze(quat2, [-1, -3]) # quat2[..., None, :, None] return paddle.sum(paddle.to_tensor(QUAT_MULTIPLY) * quat1[..., :, None, None] * mat, axis=(-3, -2)) def apply_rot_to_vec(rot, vec, unstack=False): """Multiply rotation matrix by a vector. vec is a list. Returns: a list of 3 tensors of the points """ if unstack: x, y, z = [vec[..., i] for i in range(3)] else: x, y, z = vec return [ rot[..., 0, 0] * x + rot[..., 0, 1] * y + rot[..., 0, 2] * z, rot[..., 1, 0] * x + rot[..., 1, 1] * y + rot[..., 1, 2] * z, rot[..., 2, 0] * x + rot[..., 2, 1] * y + rot[..., 2, 2] * z ] def apply_rot_to_vec_np(rot, vec, unstack=False): """Multiply rotation matrix by a vector. 
vec is a list. Returns: a list of 3 tensors of the points """ if unstack: x, y, z = [vec[..., i] for i in range(3)] else: x, y, z = vec return [ rot[0][0] * x + rot[0][1] * y + rot[0][2] * z, rot[1][0] * x + rot[1][1] * y + rot[1][2] * z, rot[2][0] * x + rot[2][1] * y + rot[2][2] * z ] def apply_inverse_rot_to_vec(rot, vec): """Multiply the inverse of a rotation matrix by a vector. vec is a list. Returns: a list of 3 tensors of the points """ # Inverse rotation is just transpose x, y, z = vec return [ rot[..., 0, 0] * x + rot[..., 1, 0] * y + rot[..., 2, 0] * z, rot[..., 0, 1] * x + rot[..., 1, 1] * y + rot[..., 2, 1] * z, rot[..., 0, 2] * x + rot[..., 1, 2] * y + rot[..., 2, 2] * z ] class QuatAffine(object): """Affine transformation represented by quaternion and vector.""" def __init__(self, quaternion: paddle.Tensor, translation: paddle.Tensor, rotation=None, normalize=True): """Initialize from quaternion and translation. Args: quaternion: Rotation represented by a quaternion, to be applied before translation. Must be a unit quaternion unless normalize==True. shape (batch, N_res, 4) translation: Translation represented as a vector. (batch, N_res, 3) rotation: Same rotation as the quaternion, represented as a (batch, N_res, 3, 3) tensor. If None, rotation will be calculated from the quaternion. normalize: If True, l2 normalize the quaternion on input. """ if quaternion is not None: assert quaternion.shape[-1] == 4 if normalize and quaternion is not None: q_length = paddle.norm(quaternion, axis=-1) quaternion = quaternion / q_length[..., None] if rotation is None: rotation = quat_to_rot(quaternion) self.quaternion = quaternion self.rotation = rotation self.translation = translation assert rotation.shape[-1] == 3 and rotation.shape[-2] == 3 assert translation.shape[-1] == 3 def to_tensor(self): return paddle.concat([self.quaternion, self.translation], axis=-1) def stop_rot_gradient(self): """ stop the gradient of rotations """ quat = self.quaternion if not quat is None: quat = quat.detach() return QuatAffine( quaternion=quat, translation=self.translation, rotation=self.rotation.detach(), normalize=False) def scale_translation(self, position_scale): """Return a new quat affine with a different scale for translation.""" return QuatAffine( self.quaternion, position_scale * self.translation, rotation=self.rotation, normalize=False) @classmethod def from_tensor(cls, tensor, normalize=False): assert tensor.shape[-1] == 7 quaternion = tensor[..., 0:4] translation = tensor[..., 4:7] return cls(quaternion, translation, normalize=normalize) def pre_compose(self, update): """Return a new QuatAffine which applies the transformation update first. Args: update: Length-6 vector. 3-vector of x, y, and z such that the quaternion update is (1, x, y, z) and zero for the 3-vector is the identity quaternion. 3-vector for translation concatenated. Returns: New QuatAffine object. """ vector_quaternion_update = update[..., 0:3] trans_update = [update[..., 3], update[..., 4], update[..., 5]] new_quaternion = (self.quaternion + quat_multiply_by_vec( self.quaternion, vector_quaternion_update)) trans_update = apply_rot_to_vec(self.rotation, trans_update) trans_update = paddle.stack(trans_update, axis=-1) new_translation = self.translation + trans_update return QuatAffine(new_quaternion, new_translation) def apply_to_point(self, point, extra_dims=0): """Apply affine to a point. Args: point: List of 3 tensors to apply affine. 
each with shape [batch_size, num_residues, num_head*num_point_qk] extra_dims: Number of dimensions at the end of the transformed_point shape that are not present in the rotation and translation. The most common use is rotation N points at once with extra_dims=1 for use in a network. Returns: Transformed point after applying affine. """ rotation = self.rotation # [batch_size, num_residues, 3, 3] translation = self.translation # [batch_size, num_residues, 3] for _ in range(extra_dims): translation = paddle.unsqueeze(translation, axis=-2) rotation = paddle.unsqueeze(rotation, axis=-3) rot_point = apply_rot_to_vec(rotation, point) return [ rot_point[0] + translation[..., 0], rot_point[1] + translation[..., 1], rot_point[2] + translation[..., 2] ] def invert_point(self, transformed_point, extra_dims=0): """Apply inverse of transformation to a point. Args: transformed_point: List of 3 tensors to apply affine extra_dims: Number of dimensions at the end of the transformed_point shape that are not present in the rotation and translation. The most common use is rotation N points at once with extra_dims=1 for use in a network. Returns: Transformed point after applying affine. """ rotation = self.rotation translation = self.translation for _ in range(extra_dims): translation = paddle.unsqueeze(translation, axis=-2) rotation = paddle.unsqueeze(rotation, axis=-3) rot_point = [ transformed_point[0] - translation[..., 0], transformed_point[1] - translation[..., 1], transformed_point[2] - translation[..., 2] ] return apply_inverse_rot_to_vec(rotation, rot_point) def invert(self): """Return a new quat affine of the invert transformation.""" pass # TODO ######Paddle Implementation def _multiply(a, b): a1 = a[..., 0, 0] a2 = a[..., 0, 1] a3 = a[..., 0, 2] a11 = a[..., 1, 0] a12 = a[..., 1, 1] a13 = a[..., 1, 2] a21 = a[..., 2, 0] a22 = a[..., 2, 1] a23 = a[..., 2, 2] b1 = b[..., 0, 0] b2 = b[..., 1, 0] b3 = b[..., 0, 1] b11 = b[..., 1, 1] b12 = b[..., 2, 0] b13 = b[..., 0, 2] b21 = b[..., 1, 2] b22 = b[..., 2, 1] b23 = b[..., 2, 2] return paddle.stack( [ paddle.stack( [ a1 * b1 + a2 * b2 + a3 * b12, a1 * b3 + a2 * b11 + a3 * b22, a1 * b13 + a2 * b21 + a3 * b23 ], axis=-1), paddle.stack( [ a11 * b1 + a12 * b2 + a13 * b12, a11 * b3 + a12 * b11 + a13 * b22, a11 * b13 + a12 * b21 + a13 * b23 ], axis=-1), paddle.stack( [ a21 * b1 + a22 * b2 + a23 * b12, a21 * b3 + a22 * b11 + a23 * b22, a21 * b13 + a22 * b21 + a23 * b23 ], axis=-1) ], axis=-2) def make_canonical_transform( n_xyz: paddle.Tensor, ca_xyz: paddle.Tensor, c_xyz: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Returns translation and rotation matrices to canonicalize residue atoms. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, n_res, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, n_res, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, n_res, 3] of carbon xyz coordinates. Returns: A tuple (translation, rotation) where: translation is an array of shape [batch, n_res, 3] defining the translation. rotation is an array of shape [batch, n_res, 3, 3] defining the rotation. 
After applying the translation and rotation to all atoms in a residue: * All atoms will be shifted so that CA is at the origin, * All atoms will be rotated so that C is at the x-axis, * All atoms will be shifted so that N is in the xy plane. """ assert len(n_xyz.shape) == 3, n_xyz.shape assert n_xyz.shape[-1] == 3, n_xyz.shape assert n_xyz.shape == ca_xyz.shape == c_xyz.shape, ( n_xyz.shape, ca_xyz.shape, c_xyz.shape) # Place CA at the origin. translation = -ca_xyz n_xyz = n_xyz + translation c_xyz = c_xyz + translation # Place C on the x-axis. c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)] # Rotate by angle c1 in the x-y plane (around the z-axis). norm = paddle.sqrt(c_x**2 + c_y**2 + 1e-20) sin_c1 = -c_y / norm cos_c1 = c_x / norm zeros = paddle.zeros_like(sin_c1) ones = paddle.ones_like(sin_c1) c1_rot_matrix = paddle.stack( [cos_c1, -sin_c1, zeros, sin_c1, cos_c1, zeros, zeros, zeros, ones], axis=-1) c1_rot_matrix = c1_rot_matrix.reshape(sin_c1.shape + [3, 3]) # Rotate by angle c2 in the x-z plane (around the y-axis). # norm = paddle.sqrt(1e-20 + c_x ** 2 + c_y ** 2 + c_z ** 2) norm = paddle.sqrt(paddle.sum(c_xyz**2, axis=-1)) + 1e-20 sin_c2 = c_z / norm cos_c2 = paddle.sqrt(c_x**2 + c_y**2) / norm c2_rot_matrix = paddle.stack( [cos_c2, zeros, sin_c2, zeros, ones, zeros, -sin_c2, zeros, cos_c2], axis=-1) c2_rot_matrix = c2_rot_matrix.reshape(sin_c2.shape + [3, 3]) c_rot_matrix = _multiply(c2_rot_matrix, c1_rot_matrix) n_xyz = paddle.stack( apply_rot_to_vec( c_rot_matrix, n_xyz, unstack=True), axis=-1) # Place N in the x-y plane. _, n_y, n_z = [n_xyz[..., i] for i in range(3)] # Rotate by angle alpha in the y-z plane (around the x-axis). norm = paddle.sqrt(n_y**2 + n_z**2 + 1e-20) sin_n = -n_z / norm cos_n = n_y / norm n_rot_matrix = paddle.stack( [ones, zeros, zeros, zeros, cos_n, -sin_n, zeros, sin_n, cos_n], axis=-1) n_rot_matrix = n_rot_matrix.reshape(sin_n.shape + [3, 3]) # pylint: enable=bad-whitespace return (translation, _multiply(n_rot_matrix, c_rot_matrix)) def make_transform_from_reference( n_xyz: paddle.Tensor, ca_xyz: paddle.Tensor, c_xyz: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Returns rotation and translation matrices to convert from reference. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, n_res, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, n_res, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, n_res, 3] of carbon xyz coordinates. Returns: A tuple (rotation, translation) where: rotation is an array of shape [batch, n_res, 3, 3] defining the rotation. translation is an array of shape [batch, n_res, 3] defining the translation. After applying the translation and rotation to the reference backbone, the coordinates will approximately equal to the input coordinates. The order of translation and rotation differs from make_canonical_transform because the rotation from this function should be applied before the translation, unlike make_canonical_transform. 
""" translation, rotation = make_canonical_transform(n_xyz, ca_xyz, c_xyz) return paddle.transpose(rotation, (0, 1, 3, 2)), -translation #######Numpy Implementation def _multiply_np(a, b): return np.stack([ np.array([ a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0], a[0][0] * b[0][1] + a[0][1] * b[1][1] + a[0][2] * b[2][1], a[0][0] * b[0][2] + a[0][1] * b[1][2] + a[0][2] * b[2][2] ]), np.array([ a[1][0] * b[0][0] + a[1][1] * b[1][0] + a[1][2] * b[2][0], a[1][0] * b[0][1] + a[1][1] * b[1][1] + a[1][2] * b[2][1], a[1][0] * b[0][2] + a[1][1] * b[1][2] + a[1][2] * b[2][2] ]), np.array([ a[2][0] * b[0][0] + a[2][1] * b[1][0] + a[2][2] * b[2][0], a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1], a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2] ]) ]) def make_canonical_transform_np( n_xyz: np.ndarray, ca_xyz: np.ndarray, c_xyz: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Returns translation and rotation matrices to canonicalize residue atoms. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, 3] of carbon xyz coordinates. Returns: A tuple (translation, rotation) where: translation is an array of shape [batch, 3] defining the translation. rotation is an array of shape [batch, 3, 3] defining the rotation. After applying the translation and rotation to all atoms in a residue: * All atoms will be shifted so that CA is at the origin, * All atoms will be rotated so that C is at the x-axis, * All atoms will be shifted so that N is in the xy plane. """ assert len(n_xyz.shape) == 2, n_xyz.shape assert n_xyz.shape[-1] == 3, n_xyz.shape assert n_xyz.shape == ca_xyz.shape == c_xyz.shape, ( n_xyz.shape, ca_xyz.shape, c_xyz.shape) # Place CA at the origin. translation = -ca_xyz n_xyz = n_xyz + translation c_xyz = c_xyz + translation # Place C on the x-axis. c_x, c_y, c_z = [c_xyz[:, i] for i in range(3)] # Rotate by angle c1 in the x-y plane (around the z-axis). sin_c1 = -c_y / np.sqrt(1e-20 + c_x**2 + c_y**2) cos_c1 = c_x / np.sqrt(1e-20 + c_x**2 + c_y**2) zeros = np.zeros_like(sin_c1) ones = np.ones_like(sin_c1) # pylint: disable=bad-whitespace c1_rot_matrix = np.stack([ np.array([cos_c1, -sin_c1, zeros]), np.array([sin_c1, cos_c1, zeros]), np.array([zeros, zeros, ones]) ]) # Rotate by angle c2 in the x-z plane (around the y-axis). sin_c2 = c_z / np.sqrt(1e-20 + c_x**2 + c_y**2 + c_z**2) cos_c2 = np.sqrt(c_x**2 + c_y**2) / np.sqrt(1e-20 + c_x**2 + c_y**2 + c_z** 2) c2_rot_matrix = np.stack([ np.array([cos_c2, zeros, sin_c2]), np.array([zeros, ones, zeros]), np.array([-sin_c2, zeros, cos_c2]) ]) c_rot_matrix = _multiply_np(c2_rot_matrix, c1_rot_matrix) n_xyz = np.stack(apply_rot_to_vec_np(c_rot_matrix, n_xyz, unstack=True)).T # Place N in the x-y plane. _, n_y, n_z = [n_xyz[:, i] for i in range(3)] # Rotate by angle alpha in the y-z plane (around the x-axis). 
sin_n = -n_z / np.sqrt(1e-20 + n_y**2 + n_z**2) cos_n = n_y / np.sqrt(1e-20 + n_y**2 + n_z**2) n_rot_matrix = np.stack([ np.array([ones, zeros, zeros]), np.array([zeros, cos_n, -sin_n]), np.array([zeros, sin_n, cos_n]) ]) return (translation, np.transpose( _multiply_np(n_rot_matrix, c_rot_matrix), [2, 0, 1])) def make_transform_from_reference_np( n_xyz: np.ndarray, ca_xyz: np.ndarray, c_xyz: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Returns rotation and translation matrices to convert from reference. Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You need to take care of such cases in your code. Args: n_xyz: An array of shape [batch, 3] of nitrogen xyz coordinates. ca_xyz: An array of shape [batch, 3] of carbon alpha xyz coordinates. c_xyz: An array of shape [batch, 3] of carbon xyz coordinates. Returns: A tuple (rotation, translation) where: rotation is an array of shape [batch, 3, 3] defining the rotation. translation is an array of shape [batch, 3] defining the translation. After applying the translation and rotation to the reference backbone, the coordinates will approximately equal to the input coordinates. The order of translation and rotation differs from make_canonical_transform because the rotation from this function should be applied before the translation, unlike make_canonical_transform. """ translation, rotation = make_canonical_transform_np(n_xyz, ca_xyz, c_xyz) return np.transpose(rotation, (0, 2, 1)), -translation ================================================ FILE: ppfleetx/models/protein_folding/r3.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Transformations for 3D coordinates. This Module contains objects for representing Vectors (Vecs), Rotation Matrices (Rots) and proper Rigid transformation (Rigids). These are represented as named tuples with arrays for each entry, for example a set of [N, M] points would be represented as a Vecs object with arrays of shape [N, M] for x, y and z. This is being done to improve readability by making it very clear what objects are geometric objects rather than relying on comments and array shapes. Another reason for this is to avoid using matrix multiplication primitives like matmul or einsum, on modern accelerator hardware these can end up on specialized cores such as tensor cores on GPU or the MXU on cloud TPUs, this often involves lower computational precision which can be problematic for coordinate geometry. Also these cores are typically optimized for larger matrices than 3 dimensional, this code is written to avoid any unintended use of these cores on both GPUs and TPUs. """ import paddle import numpy as np import collections from typing import List from . 
import (quat_affine, ) # Array of rigid 3D transformations, stored as array of rotations and # array of translations. Rigids = collections.namedtuple('Rigids', ['rot', 'trans']) class Vecs: def __init__(self, *args): if len(args) == 1: if type(args[0]) in [list, tuple] and len(args[0]) == 3: self.translation = paddle.stack(args[0], axis=-1) elif len(args[0]) == 1: self.translation = args[0] elif args[0].shape[-1] == 3: self.translation = args[0] else: raise ValueError('Invalid number of inputs') elif len(args) == 3: self.translation = paddle.stack(args, axis=-1) else: raise ValueError('Invalid number of inputs') def map(self, map_fn, *args): result = [] for i in range(3): r = map_fn(self.translation[..., i], *args) result.append(r) if result[0].shape[-1] == 1: return Vecs(paddle.concat(result, axis=-1)) else: return Vecs(paddle.stack(result, axis=-1)) @property def shape(self): return self.translation.shape @property def x(self): return self.translation[..., 0] @property def y(self): return self.translation[..., 1] @property def z(self): return self.translation[..., 2] def __getitem__(self, index): return Vecs(self.translation[index]) def __str__(self): return str(self.translation.shape) def __repr__(self): return str(self.translation.shape) def reshape(self, *argv): return self.translation.reshape(*argv) class Rots: def __init__(self, *args): if len(args) == 1: args = args[0] if len(args) == 9: rots = paddle.stack(args, axis=-1) self.rotation = rots.reshape(rots.shape[:-1] + [3, 3]) else: if args.shape[-1] == 3 and args.shape[-2] == 3: self.rotation = args elif args.shape[-1] == 9: self.rotation = args.reshape(args.shape[:-1] + [3, 3]) else: raise ValueError('Invalid shape of input') elif len(args) == 9: rots = paddle.stack(args, axis=-1) self.rotation = rots.reshape(rots.shape[:-1] + [3, 3]) else: raise ValueError('Invalid number of inputs') def map(self, map_fn, *args): result_i = [] for i in range(3): result_j = [] for j in range(3): r = map_fn(self.rotation[..., i, j], *args) result_j.append(r) if result_j[0].shape[-1] == 1: result_i.append(paddle.concat(result_j, axis=-1)) else: result_i.append(paddle.stack(result_j, axis=-1)) return Rots(paddle.stack(result_i, axis=-2)) @property def shape(self): return self.rotation.shape @property def xx(self): return self.rotation[..., 0, 0] @property def xy(self): return self.rotation[..., 0, 1] @property def xz(self): return self.rotation[..., 0, 2] @property def yx(self): return self.rotation[..., 1, 0] @property def yy(self): return self.rotation[..., 1, 1] @property def yz(self): return self.rotation[..., 1, 2] @property def zx(self): return self.rotation[..., 2, 0] @property def zy(self): return self.rotation[..., 2, 1] @property def zz(self): return self.rotation[..., 2, 2] def __getitem__(self, index): return Rots(self.rotation[index]) def __str__(self): return str(self.rotation.shape) def __repr__(self): return str(self.rotation.shape) def reshape(self, *argv): return self.rotation.reshape(*argv) def squared_difference(x, y): return paddle.square(x - y) def invert_rigids(r: Rigids) -> Rigids: """Computes group inverse of rigid transformations 'r'.""" inv_rots = invert_rots(r.rot) t = rots_mul_vecs(inv_rots, r.trans) inv_trans = Vecs(-1 * t.translation) return Rigids(inv_rots, inv_trans) def invert_rots(m: Rots) -> Rots: """Computes inverse of rotations 'm'.""" return Rots(m.xx, m.yx, m.zx, m.xy, m.yy, m.zy, m.xz, m.yz, m.zz) def rigids_from_3_points_vecs( point_on_neg_x_axis: Vecs, origin: Vecs, point_on_xy_plane: Vecs, ) -> Rigids: 
"""Create Rigids from 3 points. Jumper et al. (2021) Suppl. Alg. 21 "rigidFrom3Points" This creates a set of rigid transformations from 3 points by Gram Schmidt orthogonalization. Args: point_on_neg_x_axis: Vecs corresponding to points on the negative x axis origin: Origin of resulting rigid transformations point_on_xy_plane: Vecs corresponding to points in the xy plane Returns: Rigid transformations from global frame to local frames derived from the input points. """ m = rots_from_two_vecs( e0_unnormalized=vecs_sub(origin, point_on_neg_x_axis), e1_unnormalized=vecs_sub(point_on_xy_plane, origin)) return Rigids(rot=m, trans=origin) def rigids_from_3_points(point_on_neg_x_axis: paddle.Tensor, origin: paddle.Tensor, point_on_xy_plane: paddle.Tensor, eps: float=1e-8) -> Rigids: """Create Rigids from 3 points. Jumper et al. (2021) Suppl. Alg. 21 "rigidFrom3Points" This creates a set of rigid transformations from 3 points by Gram Schmidt orthogonalization. Argss: point_on_neg_x_axis: [*, 3] coordinates origin: [*, 3] coordinates point_on_xy_plane: [*, 3] coordinates eps: small regularizer added to squared norm before taking square root. Returns: Rigids corresponding to transformations from global frame to local frames derived from the input points. """ point_on_neg_x_axis = paddle.unbind(point_on_neg_x_axis, axis=-1) origin = paddle.unbind(origin, axis=-1) point_on_xy_plane = paddle.unbind(point_on_xy_plane, axis=-1) e0 = [c1 - c2 for c1, c2 in zip(origin, point_on_neg_x_axis)] e1 = [c1 - c2 for c1, c2 in zip(point_on_xy_plane, origin)] norms = paddle.sqrt( paddle.square(e0[0]) + paddle.square(e0[1]) + paddle.square(e0[2]) + eps) e0 = [c / norms for c in e0] dot = sum((c1 * c2 for c1, c2 in zip(e0, e1))) e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)] norms = paddle.sqrt( paddle.square(e1[0]) + paddle.square(e1[1]) + paddle.square(e1[2]) + eps) e1 = [c / norms for c in e1] e2 = [ e0[1] * e1[2] - e0[2] * e1[1], e0[2] * e1[0] - e0[0] * e1[2], e0[0] * e1[1] - e0[1] * e1[0], ] rots = paddle.stack([c for tup in zip(e0, e1, e2) for c in tup], axis=-1) return Rigids(Rots(rots), Vecs(origin)) def rigids_from_list(l: List[paddle.Tensor]) -> Rigids: """Converts flat list of arrays to rigid transformations.""" assert len(l) == 12 return Rigids(Rots(*(l[:9])), Vecs(*(l[9:]))) def rigids_from_quataffine(a: quat_affine.QuatAffine) -> Rigids: """Converts QuatAffine object to the corresponding Rigids object.""" return Rigids(Rots(a.rotation), Vecs(a.translation)) def rigids_from_tensor4x4(m: paddle.Tensor) -> Rigids: """Construct Rigids from an 4x4 array. Here the 4x4 is representing the transformation in homogeneous coordinates. Argss: m: [*, 4, 4] homogenous transformation tensor Returns: Rigids corresponding to transformations m """ assert m.shape[-1] == 4 assert m.shape[-2] == 4 sliced_m = m[..., 0:3, :] # shape is [..., 3, 4] outs = paddle.split(sliced_m, num_or_sections=[3, 1], axis=-1) return Rigids(Rots(outs[0]), Vecs(outs[1].squeeze_(axis=-1))) def rigids_from_tensor_flat9(m: paddle.Tensor) -> Rigids: """Flat9 encoding: first two columns of rotation matrix + translation.""" assert m.shape[-1] == 9 e0 = Vecs(m[..., 0], m[..., 1], m[..., 2]) e1 = Vecs(m[..., 3], m[..., 4], m[..., 5]) trans = Vecs(m[..., 6], m[..., 7], m[..., 8]) return Rigids(rot=rots_from_two_vecs(e0, e1), trans=trans) def rigids_from_tensor_flat12(m: paddle.Tensor # shape (..., 12) ) -> Rigids: # shape (...) 
"""Flat12 encoding: rotation matrix (9 floats) + translation (3 floats).""" assert m.shape[-1] == 12 return Rigids(Rots(m[..., :9]), Vecs(m[..., 9:])) def rigids_mul_rigids(a: Rigids, b: Rigids) -> Rigids: """Group composition of Rigids 'a' and 'b'.""" return Rigids( rots_mul_rots(a.rot, b.rot), vecs_add(a.trans, rots_mul_vecs(a.rot, b.trans))) def rigids_mul_rots(r: Rigids, m: Rots) -> Rigids: """Compose rigid transformations 'r' with rotations 'm'.""" return Rigids(rots_mul_rots(r.rot, m), r.trans) def rigids_mul_vecs(r: Rigids, v: Vecs) -> Vecs: """Apply rigid transforms 'r' to points 'v'.""" return vecs_add(rots_mul_vecs(r.rot, v), r.trans) def rigids_to_list(r: Rigids) -> List[paddle.Tensor]: """Turn Rigids into flat list, inverse of 'rigids_from_list'.""" return list(r.rot) + list(r.trans) def rigids_to_quataffine(r: Rigids) -> quat_affine.QuatAffine: """Convert Rigids r into QuatAffine, inverse of 'rigids_from_quataffine'.""" return quat_affine.QuatAffine( quaternion=None, rotation=r.rot.rotation, translation=r.trans.translation) def rigids_to_tensor_flat9(r: Rigids) -> paddle.Tensor: # shape (..., 9) """Flat9 encoding: first two columns of rotation matrix + translation.""" return paddle.stack( [r.rot.xx, r.rot.yx, r.rot.zx, r.rot.xy, r.rot.yy, r.rot.zy] + list(r.trans), axis=-1) def rigids_to_tensor_flat12(r: Rigids # shape (...) ) -> paddle.Tensor: # shape (..., 12) """Flat12 encoding: rotation matrix (9 floats) + translation (3 floats).""" return paddle.stack( [ r.rot.xx, r.rot.yx, r.rot.zx, r.rot.xy, r.rot.yy, r.rot.zy, r.rot.xz, r.rot.yz, r.rot.zz ] + [r.trans.x, r.trans.y, r.trans.z], axis=-1) def rots_from_tensor3x3( m: paddle.Tensor, # shape (..., 3, 3) ) -> Rots: # shape (...) """Convert rotations represented as (3, 3) array to Rots.""" assert m.shape[-1] == 3 assert m.shape[-2] == 3 return Rots(m) def rots_from_two_vecs(e0_unnormalized: Vecs, e1_unnormalized: Vecs) -> Rots: """Create rotation matrices from unnormalized vectors for the x and y-axes. This creates a rotation matrix from two vectors using Gram-Schmidt orthogonalization. Args: e0_unnormalized: vectors lying along x-axis of resulting rotation e1_unnormalized: vectors lying in xy-plane of resulting rotation Returns: Rotations resulting from Gram-Schmidt procedure. """ # Normalize the unit vector for the x-axis, e0. e0 = vecs_robust_normalize(e0_unnormalized) # make e1 perpendicular to e0. c = vecs_dot_vecs(e1_unnormalized, e0) e1 = Vecs(e1_unnormalized.translation - c.unsqueeze_(axis=-1) * e0.translation) e1 = vecs_robust_normalize(e1) # Compute e2 as cross product of e0 and e1. 
e2 = vecs_cross_vecs(e0, e1) return Rots( paddle.stack( [e0.translation, e1.translation, e2.translation], axis=-1)) def broadcast_shape(x_shape, y_shape): if x_shape == y_shape or len(x_shape) > len(y_shape): out_shape = x_shape elif len(y_shape) > len(x_shape): out_shape = y_shape else: out_shape = [] for i in range(len(x_shape)): if x_shape[i] == y_shape[i] or y_shape[i] == 1: out_shape.append(x_shape[i]) elif x_shape[i] == 1: out_shape.append(y_shape[i]) else: raise ValueError("{} and {} cannot braodcast.".format(x_shape, y_shape)) return out_shape def broadcast_to(x, broadcast_shape): if x.shape == broadcast_shape: return x else: return paddle.broadcast_to(x, broadcast_shape) def rots_mul_rots(a: Rots, b: Rots) -> Rots: """Composition of rotations 'a' and 'b'.""" out_shape = broadcast_shape(a.shape, b.shape) broadcasted_a = broadcast_to(a.rotation, out_shape) broadcasted_b = broadcast_to(b.rotation, out_shape) return Rots(paddle.matmul(broadcasted_a, broadcasted_b)) def rots_mul_vecs(m: Rots, v: Vecs) -> Vecs: """Apply rotations 'm' to vectors 'v'.""" if m.shape[:-2] == v.shape[:-1]: broadcasted_m = m.rotation broadcasted_v = v.translation else: out_shape = broadcast_shape(m.shape[:-2], v.shape[:-1]) broadcasted_m = broadcast_to(m.rotation, out_shape + [3, 3]) broadcasted_v = broadcast_to(v.translation, out_shape + [3]) return Vecs( paddle.matmul( broadcasted_m, broadcasted_v.unsqueeze(axis=-1)).squeeze_(axis=-1)) def vecs_add(v1: Vecs, v2: Vecs) -> Vecs: """Add two vectors 'v1' and 'v2'.""" return Vecs(v1.translation + v2.translation) def vecs_dot_vecs(v1: Vecs, v2: Vecs) -> paddle.Tensor: """Dot product of vectors 'v1' and 'v2'.""" return v1.x * v2.x + v1.y * v2.y + v1.z * v2.z def vecs_cross_vecs(v1: Vecs, v2: Vecs) -> Vecs: """Cross product of vectors 'v1' and 'v2'.""" return Vecs(paddle.cross(v1.translation, v2.translation, axis=-1)) def vecs_from_tensor(x: paddle.Tensor # shape (..., 3) ) -> Vecs: # shape (...) """Converts from tensor of shape (3,) to Vecs.""" assert x.shape[-1] == 3 return Vecs(x) def vecs_robust_normalize(v: Vecs, epsilon: float=1e-8) -> Vecs: """Normalizes vectors 'v'. Argss: v: vectors to be normalized. epsilon: small regularizer added to squared norm before taking square root. Returns: normalized vectors """ norms = vecs_robust_norm(v, epsilon) return Vecs(v.translation / norms.unsqueeze_(axis=-1)) def vecs_robust_norm(v: Vecs, epsilon: float=1e-8) -> paddle.Tensor: """Computes norm of vectors 'v'. Args: v: vectors to be normalized. epsilon: small regularizer added to squared norm before taking square root. Returns: norm of 'v' """ return paddle.sqrt( paddle.square(v.x) + paddle.square(v.y) + paddle.square(v.z) + epsilon) def vecs_sub(v1: Vecs, v2: Vecs) -> Vecs: """Computes v1 - v2.""" return Vecs(v1.translation - v2.translation) def vecs_squared_distance(v1: Vecs, v2: Vecs) -> paddle.Tensor: """Computes squared euclidean difference between 'v1' and 'v2'.""" return (squared_difference(v1.x, v2.x) + squared_difference(v1.y, v2.y) + squared_difference(v1.z, v2.z)) def vecs_to_tensor(v: Vecs # shape (...) ) -> paddle.Tensor: # shape(..., 3) """Converts 'v' to tensor with shape 3, inverse of 'vecs_from_tensor'.""" return v.translation ================================================ FILE: ppfleetx/models/protein_folding/residue_constants.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Constants used in AlphaFold.""" import collections import functools import os from typing import List, Mapping, Tuple import numpy as np import tree # Internal import (35fd). # Distance from one CA to next CA [trans configuration: omega = 180]. ca_ca = 3.80209737096 # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have # chi angles so their chi angle lists are empty. chi_angles_atoms = { 'ALA': [], # Chi5 in arginine is always 0 +- 5 degrees, so ignore it. 'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']], 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']], 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']], 'CYS': [['N', 'CA', 'CB', 'SG']], 'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'OE1']], 'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'OE1']], 'GLY': [], 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']], 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']], 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'], ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']], 'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'], ['CB', 'CG', 'SD', 'CE']], 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']], 'SER': [['N', 'CA', 'CB', 'OG']], 'THR': [['N', 'CA', 'CB', 'OG1']], 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], 'VAL': [['N', 'CA', 'CB', 'CG1']], } # If chi angles given in fixed-length array, this matrix determines how to mask # them for each AA type. The order is as per restype_order (see below). chi_angles_mask = [ [0.0, 0.0, 0.0, 0.0], # ALA [1.0, 1.0, 1.0, 1.0], # ARG [1.0, 1.0, 0.0, 0.0], # ASN [1.0, 1.0, 0.0, 0.0], # ASP [1.0, 0.0, 0.0, 0.0], # CYS [1.0, 1.0, 1.0, 0.0], # GLN [1.0, 1.0, 1.0, 0.0], # GLU [0.0, 0.0, 0.0, 0.0], # GLY [1.0, 1.0, 0.0, 0.0], # HIS [1.0, 1.0, 0.0, 0.0], # ILE [1.0, 1.0, 0.0, 0.0], # LEU [1.0, 1.0, 1.0, 1.0], # LYS [1.0, 1.0, 1.0, 0.0], # MET [1.0, 1.0, 0.0, 0.0], # PHE [1.0, 1.0, 0.0, 0.0], # PRO [1.0, 0.0, 0.0, 0.0], # SER [1.0, 0.0, 0.0, 0.0], # THR [1.0, 1.0, 0.0, 0.0], # TRP [1.0, 1.0, 0.0, 0.0], # TYR [1.0, 0.0, 0.0, 0.0], # VAL ] # The following chi angles are pi periodic: they can be rotated by a multiple # of pi without affecting the structure. 
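# For example, the ASP row below marks chi2 as pi-periodic because rotating it
# by pi merely swaps the interchangeable OD1/OD2 carboxylate oxygens, whereas
# the ARG row stays all-zero even though arginine has four chi angles
# (see chi_angles_mask above).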
chi_pi_periodic = [ [0.0, 0.0, 0.0, 0.0], # ALA [0.0, 0.0, 0.0, 0.0], # ARG [0.0, 0.0, 0.0, 0.0], # ASN [0.0, 1.0, 0.0, 0.0], # ASP [0.0, 0.0, 0.0, 0.0], # CYS [0.0, 0.0, 0.0, 0.0], # GLN [0.0, 0.0, 1.0, 0.0], # GLU [0.0, 0.0, 0.0, 0.0], # GLY [0.0, 0.0, 0.0, 0.0], # HIS [0.0, 0.0, 0.0, 0.0], # ILE [0.0, 0.0, 0.0, 0.0], # LEU [0.0, 0.0, 0.0, 0.0], # LYS [0.0, 0.0, 0.0, 0.0], # MET [0.0, 1.0, 0.0, 0.0], # PHE [0.0, 0.0, 0.0, 0.0], # PRO [0.0, 0.0, 0.0, 0.0], # SER [0.0, 0.0, 0.0, 0.0], # THR [0.0, 0.0, 0.0, 0.0], # TRP [0.0, 1.0, 0.0, 0.0], # TYR [0.0, 0.0, 0.0, 0.0], # VAL [0.0, 0.0, 0.0, 0.0], # UNK ] # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi, # psi and chi angles: # 0: 'backbone group', # 1: 'pre-omega-group', (empty) # 2: 'phi-group', (currently empty, because it defines only hydrogens) # 3: 'psi-group', # 4,5,6,7: 'chi1,2,3,4-group' # The atom positions are relative to the axis-end-atom of the corresponding # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis # is defined such that the dihedral-angle-definiting atom (the last entry in # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate). # format: [atomname, group_idx, rel_position] rigid_group_atom_positions = { 'ALA': [ ['N', 0, (-0.525, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, -0.000, -0.000)], ['CB', 0, (-0.529, -0.774, -1.205)], ['O', 3, (0.627, 1.062, 0.000)], ], 'ARG': [ ['N', 0, (-0.524, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, -0.000)], ['CB', 0, (-0.524, -0.778, -1.209)], ['O', 3, (0.626, 1.062, 0.000)], ['CG', 4, (0.616, 1.390, -0.000)], ['CD', 5, (0.564, 1.414, 0.000)], ['NE', 6, (0.539, 1.357, -0.000)], ['NH1', 7, (0.206, 2.301, 0.000)], ['NH2', 7, (2.078, 0.978, -0.000)], ['CZ', 7, (0.758, 1.093, -0.000)], ], 'ASN': [ ['N', 0, (-0.536, 1.357, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, -0.000, -0.000)], ['CB', 0, (-0.531, -0.787, -1.200)], ['O', 3, (0.625, 1.062, 0.000)], ['CG', 4, (0.584, 1.399, 0.000)], ['ND2', 5, (0.593, -1.188, 0.001)], ['OD1', 5, (0.633, 1.059, 0.000)], ], 'ASP': [ ['N', 0, (-0.525, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, 0.000, -0.000)], ['CB', 0, (-0.526, -0.778, -1.208)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.593, 1.398, -0.000)], ['OD1', 5, (0.610, 1.091, 0.000)], ['OD2', 5, (0.592, -1.101, -0.003)], ], 'CYS': [ ['N', 0, (-0.522, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.524, 0.000, 0.000)], ['CB', 0, (-0.519, -0.773, -1.212)], ['O', 3, (0.625, 1.062, -0.000)], ['SG', 4, (0.728, 1.653, 0.000)], ], 'GLN': [ ['N', 0, (-0.526, 1.361, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, 0.000, 0.000)], ['CB', 0, (-0.525, -0.779, -1.207)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.615, 1.393, 0.000)], ['CD', 5, (0.587, 1.399, -0.000)], ['NE2', 6, (0.593, -1.189, -0.001)], ['OE1', 6, (0.634, 1.060, 0.000)], ], 'GLU': [ ['N', 0, (-0.528, 1.361, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, -0.000, -0.000)], ['CB', 0, (-0.526, -0.781, -1.207)], ['O', 3, (0.626, 1.062, 0.000)], ['CG', 4, (0.615, 1.392, 0.000)], ['CD', 5, (0.600, 1.397, 0.000)], ['OE1', 6, (0.607, 1.095, -0.000)], ['OE2', 6, (0.589, -1.104, -0.001)], ], 'GLY': [ ['N', 0, (-0.572, 1.337, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.517, -0.000, -0.000)], ['O', 3, (0.626, 1.062, -0.000)], ], 'HIS': [ ['N', 0, (-0.527, 1.360, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, 
0.000, 0.000)], ['CB', 0, (-0.525, -0.778, -1.208)], ['O', 3, (0.625, 1.063, 0.000)], ['CG', 4, (0.600, 1.370, -0.000)], ['CD2', 5, (0.889, -1.021, 0.003)], ['ND1', 5, (0.744, 1.160, -0.000)], ['CE1', 5, (2.030, 0.851, 0.002)], ['NE2', 5, (2.145, -0.466, 0.004)], ], 'ILE': [ ['N', 0, (-0.493, 1.373, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, -0.000, -0.000)], ['CB', 0, (-0.536, -0.793, -1.213)], ['O', 3, (0.627, 1.062, -0.000)], ['CG1', 4, (0.534, 1.437, -0.000)], ['CG2', 4, (0.540, -0.785, -1.199)], ['CD1', 5, (0.619, 1.391, 0.000)], ], 'LEU': [ ['N', 0, (-0.520, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, -0.000)], ['CB', 0, (-0.522, -0.773, -1.214)], ['O', 3, (0.625, 1.063, -0.000)], ['CG', 4, (0.678, 1.371, 0.000)], ['CD1', 5, (0.530, 1.430, -0.000)], ['CD2', 5, (0.535, -0.774, 1.200)], ], 'LYS': [ ['N', 0, (-0.526, 1.362, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, 0.000, 0.000)], ['CB', 0, (-0.524, -0.778, -1.208)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.619, 1.390, 0.000)], ['CD', 5, (0.559, 1.417, 0.000)], ['CE', 6, (0.560, 1.416, 0.000)], ['NZ', 7, (0.554, 1.387, 0.000)], ], 'MET': [ ['N', 0, (-0.521, 1.364, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, 0.000, 0.000)], ['CB', 0, (-0.523, -0.776, -1.210)], ['O', 3, (0.625, 1.062, -0.000)], ['CG', 4, (0.613, 1.391, -0.000)], ['SD', 5, (0.703, 1.695, 0.000)], ['CE', 6, (0.320, 1.786, -0.000)], ], 'PHE': [ ['N', 0, (-0.518, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.524, 0.000, -0.000)], ['CB', 0, (-0.525, -0.776, -1.212)], ['O', 3, (0.626, 1.062, -0.000)], ['CG', 4, (0.607, 1.377, 0.000)], ['CD1', 5, (0.709, 1.195, -0.000)], ['CD2', 5, (0.706, -1.196, 0.000)], ['CE1', 5, (2.102, 1.198, -0.000)], ['CE2', 5, (2.098, -1.201, -0.000)], ['CZ', 5, (2.794, -0.003, -0.001)], ], 'PRO': [ ['N', 0, (-0.566, 1.351, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, -0.000, 0.000)], ['CB', 0, (-0.546, -0.611, -1.293)], ['O', 3, (0.621, 1.066, 0.000)], ['CG', 4, (0.382, 1.445, 0.0)], # ['CD', 5, (0.427, 1.440, 0.0)], ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger ], 'SER': [ ['N', 0, (-0.529, 1.360, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, -0.000)], ['CB', 0, (-0.518, -0.777, -1.211)], ['O', 3, (0.626, 1.062, -0.000)], ['OG', 4, (0.503, 1.325, 0.000)], ], 'THR': [ ['N', 0, (-0.517, 1.364, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.526, 0.000, -0.000)], ['CB', 0, (-0.516, -0.793, -1.215)], ['O', 3, (0.626, 1.062, 0.000)], ['CG2', 4, (0.550, -0.718, -1.228)], ['OG1', 4, (0.472, 1.353, 0.000)], ], 'TRP': [ ['N', 0, (-0.521, 1.363, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.525, -0.000, 0.000)], ['CB', 0, (-0.523, -0.776, -1.212)], ['O', 3, (0.627, 1.062, 0.000)], ['CG', 4, (0.609, 1.370, -0.000)], ['CD1', 5, (0.824, 1.091, 0.000)], ['CD2', 5, (0.854, -1.148, -0.005)], ['CE2', 5, (2.186, -0.678, -0.007)], ['CE3', 5, (0.622, -2.530, -0.007)], ['NE1', 5, (2.140, 0.690, -0.004)], ['CH2', 5, (3.028, -2.890, -0.013)], ['CZ2', 5, (3.283, -1.543, -0.011)], ['CZ3', 5, (1.715, -3.389, -0.011)], ], 'TYR': [ ['N', 0, (-0.522, 1.362, 0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.524, -0.000, -0.000)], ['CB', 0, (-0.522, -0.776, -1.213)], ['O', 3, (0.627, 1.062, -0.000)], ['CG', 4, (0.607, 1.382, -0.000)], ['CD1', 5, (0.716, 1.195, -0.000)], ['CD2', 5, (0.713, -1.194, -0.001)], ['CE1', 5, (2.107, 1.200, -0.002)], ['CE2', 5, (2.104, -1.201, -0.003)], ['OH', 5, 
(4.168, -0.002, -0.005)], ['CZ', 5, (2.791, -0.001, -0.003)], ], 'VAL': [ ['N', 0, (-0.494, 1.373, -0.000)], ['CA', 0, (0.000, 0.000, 0.000)], ['C', 0, (1.527, -0.000, -0.000)], ['CB', 0, (-0.533, -0.795, -1.213)], ['O', 3, (0.627, 1.062, -0.000)], ['CG1', 4, (0.540, 1.429, -0.000)], ['CG2', 4, (0.533, -0.776, 1.203)], ], } # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention. residue_atoms = { 'ALA': ['C', 'CA', 'CB', 'N', 'O'], 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'], 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'], 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'], 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'], 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'], 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'], 'GLY': ['C', 'CA', 'N', 'O'], 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'], 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'], 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'], 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'], 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'], 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'], 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'], 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'], 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'], 'TRP': [ 'C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2', 'N', 'NE1', 'O' ], 'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O', 'OH'], 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O'] } # Naming swaps for ambiguous atom names. # Due to symmetries in the amino acids the naming of atoms is ambiguous in # 4 of the 20 amino acids. # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities # in LEU, VAL and ARG can be resolved by using the 3d constellations of # the 'ambiguous' atoms and their neighbours) residue_atom_renaming_swaps = { 'ASP': { 'OD1': 'OD2' }, 'GLU': { 'OE1': 'OE2' }, 'PHE': { 'CD1': 'CD2', 'CE1': 'CE2' }, 'TYR': { 'CD1': 'CD2', 'CE1': 'CE2' }, } # Van der Waals radii [Angstroem] of the atoms (from Wikipedia) van_der_waals_radius = { 'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8, } Bond = collections.namedtuple( 'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev']) BondAngle = collections.namedtuple( 'BondAngle', ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev']) @functools.lru_cache(maxsize=None) def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]], Mapping[ str, List[Bond]], Mapping[str, List[BondAngle]]]: """Load stereo_chemical_props.txt into a nice structure. Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite edge of the triangle ("residue_virtual_bonds"). Returns: residue_bonds: Dict that maps resname -> list of Bond tuples. residue_virtual_bonds: Dict that maps resname -> list of Bond tuples. residue_bond_angles: Dict that maps resname -> list of BondAngle tuples. """ stereo_chemical_props_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'stereo_chemical_props.txt') with open(stereo_chemical_props_path, 'rt') as f: stereo_chemical_props = f.read() lines_iter = iter(stereo_chemical_props.splitlines()) # Load bond lengths. residue_bonds = {} next(lines_iter) # Skip header line. 
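# Each row of the bond-length section has the form
# "<atom1>-<atom2> <resname> <length> <stddev>" (whitespace separated); a line
# consisting of a single '-' character terminates the section.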
for line in lines_iter: if line.strip() == '-': break bond, resname, length, stddev = line.split() atom1, atom2 = bond.split('-') if resname not in residue_bonds: residue_bonds[resname] = [] residue_bonds[resname].append( Bond(atom1, atom2, float(length), float(stddev))) residue_bonds['UNK'] = [] # Load bond angles. residue_bond_angles = {} next(lines_iter) # Skip empty line. next(lines_iter) # Skip header line. for line in lines_iter: if line.strip() == '-': break bond, resname, angle_degree, stddev_degree = line.split() atom1, atom2, atom3 = bond.split('-') if resname not in residue_bond_angles: residue_bond_angles[resname] = [] residue_bond_angles[resname].append( BondAngle(atom1, atom2, atom3, float(angle_degree) / 180. * np.pi, float(stddev_degree) / 180. * np.pi)) residue_bond_angles['UNK'] = [] def make_bond_key(atom1_name, atom2_name): """Unique key to lookup bonds.""" return '-'.join(sorted([atom1_name, atom2_name])) # Translate bond angles into distances ("virtual bonds"). residue_virtual_bonds = {} for resname, bond_angles in residue_bond_angles.items(): # Create a fast lookup dict for bond lengths. bond_cache = {} for b in residue_bonds[resname]: bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b residue_virtual_bonds[resname] = [] for ba in bond_angles: bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)] bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)] # Compute distance between atom1 and atom3 using the law of cosines # c^2 = a^2 + b^2 - 2ab*cos(gamma). gamma = ba.angle_rad length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 * bond1.length * bond2.length * np.cos(gamma)) # Propagation of uncertainty assuming uncorrelated errors. dl_outer = 0.5 / length dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer dl_db1 = ( 2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer dl_db2 = ( 2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer stddev = np.sqrt((dl_dgamma * ba.stddev)**2 + ( dl_db1 * bond1.stddev)**2 + (dl_db2 * bond2.stddev)**2) residue_virtual_bonds[resname].append( Bond(ba.atom1_name, ba.atom3name, length, stddev)) return (residue_bonds, residue_virtual_bonds, residue_bond_angles) # Between-residue bond lengths for general bonds (first element) and for Proline # (second element). between_res_bond_length_c_n = [1.329, 1.341] between_res_bond_length_stddev_c_n = [0.014, 0.016] # Between-residue cos_angles. between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315 between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995 # This mapping is used when we need to store atom data in a format that requires # fixed atom data size for every residue (e.g. a numpy array). atom_types = [ 'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD', 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3', 'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2', 'CZ3', 'NZ', 'OXT' ] atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)} atom_type_num = len(atom_types) # := 37. 
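# Illustrative sketch (not part of the original module and not used elsewhere):
# the law-of-cosines step from load_stereo_chemical_props in isolation.  With
# approximate literature values a = 1.459 (N-CA), b = 1.525 (CA-C) and an
# N-CA-C angle of roughly 111 degrees, it gives a "virtual" N...C bond of about
# 2.46 Angstroem, which is later treated like a real bond when the per-residue
# distance bounds are assembled in make_atom14_dists_bounds.
def _example_virtual_bond_length(length_a: float,
                                 length_b: float,
                                 gamma_rad: float) -> float:
    """Length of the triangle edge opposite to the angle gamma_rad."""
    return np.sqrt(length_a**2 + length_b**2 -
                   2. * length_a * length_b * np.cos(gamma_rad))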
# A compact atom encoding with 14 columns # pylint: disable=line-too-long # pylint: disable=bad-whitespace restype_name_to_atom14_names = { 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''], 'ARG': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '', '', '' ], 'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''], 'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''], 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''], 'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''], 'GLU': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', '' ], 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''], 'HIS': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '', '', '' ], 'ILE': [ 'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', '' ], 'LEU': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', '' ], 'LYS': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', '' ], 'MET': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', '' ], 'PHE': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '', '', '' ], 'PRO': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', '' ], 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''], 'THR': [ 'N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', '' ], 'TRP': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2' ], 'TYR': [ 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', '' ], 'VAL': [ 'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', '' ], 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''], } # pylint: enable=line-too-long # pylint: enable=bad-whitespace # This is the standard residue order when coding AA type as a number. # Reproduce it by taking 3-letter AA codes and sorting them alphabetically. restypes = [ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' ] restype_order = {restype: i for i, restype in enumerate(restypes)} restype_num = len(restypes) # := 20. unk_restype_index = restype_num # Catch-all index for unknown restypes. restypes_with_x = restypes + ['X'] restype_order_with_x = { restype: i for i, restype in enumerate(restypes_with_x) } def sequence_to_onehot(sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool=False) -> np.ndarray: """Maps the given sequence into a one-hot encoded matrix. Args: sequence: An amino acid sequence. mapping: A dictionary mapping amino acids to integers. map_unknown_to_x: If True, any amino acid that is not in the mapping will be mapped to the unknown amino acid 'X'. If the mapping doesn't contain amino acid 'X', an error will be thrown. If False, any amino acid not in the mapping will throw an error. Returns: A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of the sequence. Raises: ValueError: If the mapping doesn't contain values from 0 to num_unique_aas - 1 without any gaps. """ num_entries = max(mapping.values()) + 1 if sorted(set(mapping.values())) != list(range(num_entries)): raise ValueError( 'The mapping must have values from 0 to num_unique_aas-1 ' 'without any gaps. 
Got: %s' % sorted(mapping.values())) one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32) for aa_index, aa_type in enumerate(sequence): if map_unknown_to_x: if aa_type.isalpha() and aa_type.isupper(): aa_id = mapping.get(aa_type, mapping['X']) else: raise ValueError( f'Invalid character in the sequence: {aa_type}') else: aa_id = mapping[aa_type] one_hot_arr[aa_index, aa_id] = 1 return one_hot_arr restype_1to3 = { 'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'C': 'CYS', 'Q': 'GLN', 'E': 'GLU', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'F': 'PHE', 'P': 'PRO', 'S': 'SER', 'T': 'THR', 'W': 'TRP', 'Y': 'TYR', 'V': 'VAL', } # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple # 1-to-1 mapping of 3 letter names to one letter names. The latter contains # many more, and less common, three letter names as keys and maps many of these # to the same one letter name (including 'X' and 'U' which we don't use here). restype_3to1 = {v: k for k, v in restype_1to3.items()} # Define a restype name for all unknown residues. unk_restype = 'UNK' resnames = [restype_1to3[r] for r in restypes] + [unk_restype] resname_to_idx = {resname: i for i, resname in enumerate(resnames)} # The mapping here uses hhblits convention, so that B is mapped to D, J and O # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the # remaining 20 amino acids are kept in alphabetical order. # There are 2 non-amino acid codes, X (representing any amino acid) and # "-" representing a missing amino acid in an alignment. The id for these # codes is put at the end (20 and 21) so that they can easily be ignored if # desired. HHBLITS_AA_TO_ID = { 'A': 0, 'B': 2, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'J': 20, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'O': 20, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'U': 1, 'V': 17, 'W': 18, 'X': 20, 'Y': 19, 'Z': 3, '-': 21, } # Partial inversion of HHBLITS_AA_TO_ID. ID_TO_HHBLITS_AA = { 0: 'A', 1: 'C', # Also U. 2: 'D', # Also B. 3: 'E', # Also Z. 4: 'F', 5: 'G', 6: 'H', 7: 'I', 8: 'K', 9: 'L', 10: 'M', 11: 'N', 12: 'P', 13: 'Q', 14: 'R', 15: 'S', 16: 'T', 17: 'V', 18: 'W', 19: 'Y', 20: 'X', # Includes J and O. 21: '-', } restypes_with_x_and_gap = restypes + ['X', '-'] MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple( restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) for i in range(len(restypes_with_x_and_gap))) def _make_standard_atom_mask() -> np.ndarray: """Returns [num_res_types, num_atom_types] mask array.""" # +1 to account for unknown (all 0s). mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32) for restype, restype_letter in enumerate(restypes): restype_name = restype_1to3[restype_letter] atom_names = residue_atoms[restype_name] for atom_name in atom_names: atom_type = atom_order[atom_name] mask[restype, atom_type] = 1 return mask STANDARD_ATOM_MASK = _make_standard_atom_mask() # A one hot representation for the first and second atoms defining the axis # of rotation for each chi-angle in each residue. 
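# Illustrative usage of sequence_to_onehot (not part of the original module):
# with map_unknown_to_x=True, letters outside the 20 standard amino acids fall
# back to 'X' (index 20), e.g.
#   sequence_to_onehot('ARB', restype_order_with_x, map_unknown_to_x=True)
# returns a (3, 21) int32 array with ones at [0, 0], [1, 1] and [2, 20].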
def chi_angle_atom(atom_index: int) -> np.ndarray: """Define chi-angle rigid groups via one-hot representations.""" chi_angles_index = {} one_hots = [] for k, v in chi_angles_atoms.items(): indices = [atom_types.index(s[atom_index]) for s in v] indices.extend([-1] * (4 - len(indices))) chi_angles_index[k] = indices for r in restypes: res3 = restype_1to3[r] one_hot = np.eye(atom_type_num)[chi_angles_index[res3]] one_hots.append(one_hot) one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`. one_hot = np.stack(one_hots, axis=0) one_hot = np.transpose(one_hot, [0, 2, 1]) return one_hot chi_atom_1_one_hot = chi_angle_atom(1) chi_atom_2_one_hot = chi_angle_atom(2) # An array like chi_angles_atoms but using indices rather than names. chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes] chi_angles_atom_indices = tree.map_structure( lambda atom_name: atom_order[atom_name], chi_angles_atom_indices) chi_angles_atom_indices = np.array([ chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) for chi_atoms in chi_angles_atom_indices ]) # Mapping from (res_name, atom_name) pairs to the atom's chi group index # and atom index within that group. chi_groups_for_atom = collections.defaultdict(list) for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items(): for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res): for atom_i, atom in enumerate(chi_group): chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i)) chi_groups_for_atom = dict(chi_groups_for_atom) def _make_rigid_transformation_4x4(ex, ey, translation): """Create a rigid 4x4 transformation matrix from two axes and transl.""" # Normalize ex. ex_normalized = ex / np.linalg.norm(ex) # make ey perpendicular to ex ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized ey_normalized /= np.linalg.norm(ey_normalized) # compute ez as cross product eznorm = np.cross(ex_normalized, ey_normalized) m = np.stack( [ex_normalized, ey_normalized, eznorm, translation]).transpose() m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0) return m # create an array with (restype, atomtype) --> rigid_group_idx # and an array with (restype, atomtype, coord) for the atom positions # and compute affine transformation matrices (4,4) from one rigid group to the # previous group restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=np.int) restype_atom37_mask = np.zeros([21, 37], dtype=np.float32) restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32) restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=np.int) restype_atom14_mask = np.zeros([21, 14], dtype=np.float32) restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32) restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32) def _make_rigid_group_constants(): """Fill the arrays above.""" for restype, restype_letter in enumerate(restypes): resname = restype_1to3[restype_letter] for atomname, group_idx, atom_position in rigid_group_atom_positions[ resname]: atomtype = atom_order[atomname] restype_atom37_to_rigid_group[restype, atomtype] = group_idx restype_atom37_mask[restype, atomtype] = 1 restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position atom14idx = restype_name_to_atom14_names[resname].index(atomname) restype_atom14_to_rigid_group[restype, atom14idx] = group_idx restype_atom14_mask[restype, atom14idx] = 1 restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position for restype, restype_letter in enumerate(restypes): resname = 
restype_1to3[restype_letter] atom_positions = { name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname] } # backbone to backbone is the identity transform restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4) # pre-omega-frame to backbone (currently dummy identity matrix) restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4) # phi-frame to backbone mat = _make_rigid_transformation_4x4( ex=atom_positions['N'] - atom_positions['CA'], ey=np.array([1., 0., 0.]), translation=atom_positions['N']) restype_rigid_group_default_frame[restype, 2, :, :] = mat # psi-frame to backbone mat = _make_rigid_transformation_4x4( ex=atom_positions['C'] - atom_positions['CA'], ey=atom_positions['CA'] - atom_positions['N'], translation=atom_positions['C']) restype_rigid_group_default_frame[restype, 3, :, :] = mat # chi1-frame to backbone if chi_angles_mask[restype][0]: base_atom_names = chi_angles_atoms[resname][0] base_atom_positions = [ atom_positions[name] for name in base_atom_names ] mat = _make_rigid_transformation_4x4( ex=base_atom_positions[2] - base_atom_positions[1], ey=base_atom_positions[0] - base_atom_positions[1], translation=base_atom_positions[2]) restype_rigid_group_default_frame[restype, 4, :, :] = mat # chi2-frame to chi1-frame # chi3-frame to chi2-frame # chi4-frame to chi3-frame # luckily all rotation axes for the next frame start at (0,0,0) of the # previous frame for chi_idx in range(1, 4): if chi_angles_mask[restype][chi_idx]: axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2] axis_end_atom_position = atom_positions[axis_end_atom_name] mat = _make_rigid_transformation_4x4( ex=axis_end_atom_position, ey=np.array([-1., 0., 0.]), translation=axis_end_atom_position) restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat _make_rigid_group_constants() def make_atom14_dists_bounds(overlap_tolerance=1.5, bond_length_tolerance_factor=15): """compute upper and lower bounds for bonds to assess violations.""" restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32) restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32) restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32) residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props() for restype, restype_letter in enumerate(restypes): resname = restype_1to3[restype_letter] atom_list = restype_name_to_atom14_names[resname] # create lower and upper bounds for clashes for atom1_idx, atom1_name in enumerate(atom_list): if not atom1_name: continue atom1_radius = van_der_waals_radius[atom1_name[0]] for atom2_idx, atom2_name in enumerate(atom_list): if (not atom2_name) or atom1_idx == atom2_idx: continue atom2_radius = van_der_waals_radius[atom2_name[0]] lower = atom1_radius + atom2_radius - overlap_tolerance upper = 1e10 restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper # overwrite lower and upper bounds for bonds and angles for b in residue_bonds[resname] + residue_virtual_bonds[resname]: atom1_idx = atom_list.index(b.atom1_name) atom2_idx = atom_list.index(b.atom2_name) lower = b.length - bond_length_tolerance_factor * b.stddev upper = b.length + bond_length_tolerance_factor * b.stddev restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = 
lower restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev return { 'lower_bound': restype_atom14_bond_lower_bound, # shape (21,14,14) 'upper_bound': restype_atom14_bond_upper_bound, # shape (21,14,14) 'stddev': restype_atom14_bond_stddev, # shape (21,14,14) } ================================================ FILE: ppfleetx/models/protein_folding/template.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from ppfleetx.distributed.protein_folding import dap from .attentions import ( Attention, TriangleMultiplication, TriangleAttention, ) from .common import ( Transition, Dropout, recompute_wrapper, dgram_from_positions, subbatch, ) from . import (residue_constants, ) from . import (quat_affine, ) class TemplatePair(nn.Layer): """Pair processing for the templates. Jumper et al. (2021) Suppl. Alg. 16 "TemplatePairStack" lines 2-6 """ def __init__(self, channel_num, config, global_config): super(TemplatePair, self).__init__() self.config = config self.global_config = global_config channel_num = {} channel_num[ 'pair_channel'] = self.config.triangle_attention_ending_node.value_dim self.triangle_attention_starting_node = TriangleAttention( channel_num, self.config.triangle_attention_starting_node, self.global_config, name='triangle_attention_starting_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_starting_node) self.triangle_starting_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_attention_ending_node = TriangleAttention( channel_num, self.config.triangle_attention_ending_node, self.global_config, name='triangle_attention_ending_node') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_attention_ending_node) self.triangle_ending_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_multiplication_outgoing = TriangleMultiplication( channel_num, self.config.triangle_multiplication_outgoing, self.global_config, name='triangle_multiplication_outgoing') dropout_rate, dropout_axis = self._parse_dropout_params( self.triangle_multiplication_outgoing) self.triangle_outgoing_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.triangle_multiplication_incoming = TriangleMultiplication( channel_num, self.config.triangle_multiplication_incoming, self.global_config, name='triangle_multiplication_incoming') dropout_rate, dropout_axis = self._parse_dropout_params( 
self.triangle_multiplication_incoming) self.triangle_incoming_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) self.pair_transition = Transition( channel_num, self.config.pair_transition, self.global_config, is_extra_msa=False, transition_type='pair_transition') dropout_rate, dropout_axis = self._parse_dropout_params( self.pair_transition) self.pair_transition_dropout = nn.Dropout(dropout_rate, axis=dropout_axis) \ if not self.global_config.use_dropout_nd else Dropout(dropout_rate, axis=dropout_axis) def _parse_dropout_params(self, module): dropout_rate = 0.0 if self.global_config.deterministic else \ module.config.dropout_rate dropout_axis = None if module.config.shared_dropout: dropout_axis = { 'per_row': [0, 2, 3], 'per_column': [0, 1, 3], }[module.config.orientation] return dropout_rate, dropout_axis def forward(self, pair_act, pair_mask): """Builds one block of TemplatePair module. Arguments: pair_act: Pair activations for single template, shape [batch, N_res, N_res, c_t]. pair_mask: Pair mask, shape [batch, N_res, N_res]. Returns: Updated pair_act, shape [batch, N_res, N_res, c_t]. """ pair_mask_row = dap.scatter(pair_mask, axis=1) pair_mask_col = dap.scatter(pair_mask, axis=2) residual = self.triangle_attention_starting_node(pair_act, pair_mask_row) residual = self.triangle_starting_dropout(residual) pair_act = pair_act + residual pair_act = dap.row_to_col(pair_act) residual = self.triangle_attention_ending_node(pair_act, pair_mask_col) residual = self.triangle_ending_dropout(residual) pair_act = pair_act + residual pair_act = dap.col_to_row(pair_act) residual = self.triangle_multiplication_outgoing(pair_act, pair_mask_row) residual = self.triangle_outgoing_dropout(residual) pair_act = pair_act + residual pair_act = dap.row_to_col(pair_act) residual = self.triangle_multiplication_incoming(pair_act, pair_mask_col) residual = self.triangle_incoming_dropout(residual) pair_act = pair_act + residual residual = self.pair_transition(pair_act, pair_mask) residual = self.pair_transition_dropout(residual) pair_act = pair_act + residual pair_act = dap.col_to_row(pair_act) return pair_act class SingleTemplateEmbedding(nn.Layer): """Embeds a single template. Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9+11 """ def __init__(self, channel_num, config, global_config): super(SingleTemplateEmbedding, self).__init__() self.config = config self.channel_num = channel_num self.global_config = global_config Linear = paddle.incubate.nn.FusedLinear if self.global_config.fuse_linear else paddle.nn.Linear self.embedding2d = Linear(channel_num['template_pair'], self.config.template_pair_stack. triangle_attention_ending_node.value_dim) self.template_pair_stack = nn.LayerList() for _ in range(self.config.template_pair_stack.num_block): self.template_pair_stack.append( TemplatePair(self.channel_num, self.config.template_pair_stack, self.global_config)) self.output_layer_norm = nn.LayerNorm(self.config.attention.key_dim) def forward(self, query_embedding, batch, mask_2d): """Build the single template embedding. Arguments: query_embedding: Query pair representation, shape [batch, N_res, N_res, c_z]. batch: A batch of template features (note the template dimension has been stripped out as this module only runs over a single template). mask_2d: Padding mask (Note: this doesn't care if a template exists, unlike the template_pseudo_beta_mask). Returns: A template embedding [N_res, N_res, c_z]. 
""" assert mask_2d.dtype == query_embedding.dtype dtype = query_embedding.dtype num_res = batch['template_aatype'].shape[1] template_mask = batch['template_pseudo_beta_mask'] # template_mask[..., None] * template_mask[..., None, :] template_mask_2d = template_mask.unsqueeze( axis=-1) * template_mask.unsqueeze(axis=-2) template_mask_2d = template_mask_2d.astype(dtype) template_dgram = dgram_from_positions(batch['template_pseudo_beta'], **self.config.dgram_features) template_dgram = template_dgram.astype(dtype) aatype = nn.functional.one_hot(batch['template_aatype'], 22) aatype = aatype.astype(dtype) to_concat = [template_dgram, template_mask_2d.unsqueeze(axis=-1)] to_concat.append( paddle.tile( aatype.unsqueeze(axis=-3), # aatype[..., None, :, :] [1, num_res, 1, 1])) to_concat.append( paddle.tile( aatype.unsqueeze(axis=-2), # aatype[..., None, :] [1, 1, num_res, 1])) n, ca, c = [residue_constants.atom_order[a] for a in ('N', 'CA', 'C')] rot, trans = quat_affine.make_transform_from_reference( n_xyz=batch['template_all_atom_positions'][..., n, :], ca_xyz=batch['template_all_atom_positions'][..., ca, :], c_xyz=batch['template_all_atom_positions'][..., c, :]) affines = quat_affine.QuatAffine( quaternion=quat_affine.rot_to_quat(rot), translation=trans, rotation=rot) points = [ paddle.unsqueeze( x, axis=-2) for x in paddle.unstack( affines.translation, axis=-1) ] affine_vec = affines.invert_point(points, extra_dims=1) inv_distance_scalar = paddle.rsqrt(1e-6 + sum( [paddle.square(x) for x in affine_vec])) # Backbone affine mask: whether the residue has C, CA, N # (the template mask defined above only considers pseudo CB). template_mask = (batch['template_all_atom_masks'][..., n] * batch['template_all_atom_masks'][..., ca] * batch['template_all_atom_masks'][..., c]) # template_mask[..., None] * template_mask[..., None, :] template_mask_2d = template_mask.unsqueeze( axis=-1) * template_mask.unsqueeze(axis=-2) inv_distance_scalar *= template_mask_2d.astype( inv_distance_scalar.dtype) unit_vector = [(x * inv_distance_scalar).unsqueeze(axis=-1) for x in affine_vec] unit_vector = [x.astype(dtype) for x in unit_vector] if not self.config.use_template_unit_vector: unit_vector = [paddle.zeros_like(x) for x in unit_vector] to_concat.extend(unit_vector) template_mask_2d = template_mask_2d.astype(dtype) to_concat.append(template_mask_2d.unsqueeze(axis=-1)) act = paddle.concat(to_concat, axis=-1) # Mask out non-template regions so we don't get arbitrary values in the # distogram for these regions. act *= template_mask_2d.unsqueeze(axis=-1) act = self.embedding2d(act) act = dap.scatter(act, axis=1) for idx, pair_encoder in enumerate(self.template_pair_stack): act = recompute_wrapper( pair_encoder, act, mask_2d, is_recompute=self.training and idx >= self.config.template_pair_stack.recompute_start_block_index) act = dap.gather(act, axis=1) act = self.output_layer_norm(act) return act class TemplateEmbedding(nn.Layer): """Embeds a set of templates. Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-12 Jumper et al. (2021) Suppl. Alg. 
17 "TemplatePointwiseAttention" """ def __init__(self, channel_num, config, global_config): super(TemplateEmbedding, self).__init__() self.config = config self.global_config = global_config self.single_template_embedding = SingleTemplateEmbedding( channel_num, config, global_config) self.attention = Attention( config.attention, global_config, channel_num['pair_channel'], config.attention.key_dim, channel_num['pair_channel']) def forward(self, query_embedding, template_batch, mask_2d): """Build TemplateEmbedding module. Arguments: query_embedding: Query pair representation, shape [n_batch, N_res, N_res, c_z]. template_batch: A batch of template features. mask_2d: Padding mask (Note: this doesn't care if a template exists, unlike the template_pseudo_beta_mask). Returns: A template embedding [n_batch, N_res, N_res, c_z]. """ num_templates = template_batch['template_mask'].shape[1] num_channels = (self.config.template_pair_stack. triangle_attention_ending_node.value_dim) num_res = query_embedding.shape[1] dtype = query_embedding.dtype template_mask = template_batch['template_mask'] template_mask = template_mask.astype(dtype) query_channels = query_embedding.shape[-1] outs = [] for i in range(num_templates): # By default, num_templates = 4 batch0 = { k: paddle.squeeze( v.slice([1], [i], [i + 1]), axis=1) for k, v in template_batch.items() } outs.append( self.single_template_embedding(query_embedding, batch0, mask_2d)) template_pair_repr = paddle.stack(outs, axis=1) flat_query = paddle.reshape( query_embedding, [-1, num_res * num_res, 1, query_channels]) flat_templates = paddle.reshape( paddle.transpose(template_pair_repr, [0, 2, 3, 1, 4]), [-1, num_res * num_res, num_templates, num_channels]) bias = 1e9 * (template_mask[:, None, None, None, :] - 1.) if not self.training: sb_attn = subbatch(self.attention, [0, 1], [1, 1], self.config.subbatch_size, 1) emb = sb_attn(flat_query, flat_templates, bias) else: emb = self.attention(flat_query, flat_templates, bias) emb = paddle.reshape(emb, [-1, num_res, num_res, query_channels]) # No gradients if no templates. emb *= (paddle.sum(template_mask) > 0.).astype(emb.dtype) return emb ================================================ FILE: ppfleetx/models/vision_model/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/models/vision_model/factory.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import copy import importlib from .vit import * from .loss import * from .metrics import * from .resnet import * from .moco import * from .layers import * __all__ = ['build', ] def build(config): if config is None: return None config = copy.deepcopy(config) model_type = config.pop("name") mod = importlib.import_module(__name__) model = getattr(mod, model_type)(**config) return model ================================================ FILE: ppfleetx/models/vision_model/general_classification_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import copy import importlib from collections import defaultdict import numpy as np import paddle from paddle import LazyGuard from paddle.static import InputSpec from ppfleetx.utils.log import logger from ppfleetx.core.module.basic_module import BasicModule from .factory import build class GeneralClsModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(GeneralClsModule, self).__init__(configs) assert 'train' in self.model_configs.loss self.loss_fn = build(self.model_configs.loss.train) self.eval_loss_fn = None if 'eval' in self.model_configs.loss: self.eval_loss_fn = build(self.model_configs.loss.eval) if 'train' in self.model_configs.metric: self.train_metric_fn = build(self.model_configs.metric.train) if 'eval' in self.model_configs.metric: self.eval_metric_fn = build(self.model_configs.metric.eval) self.train_batch_size = None self.eval_batch_size = None self.best_metric = 0.0 self.acc_list = [] def get_model(self): if not hasattr(self, 'model') or self.model is None: self.model = build(self.model_configs.model) return self.model def qat_model(self): self.quanter = paddleslim.dygraph.quant.QAT(config=self.qat_config) self.quanter.quantize(self.model) def forward(self, inputs): return self.model(inputs) def training_step(self, batch): inputs, labels = batch if self.train_batch_size is None: self.train_batch_size = inputs.shape[ 0] * paddle.distributed.get_world_size() inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): ips = self.train_batch_size / log_dict['train_cost'] logger.info( "[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips)) def validation_step(self, batch): inputs, labels = batch batch_size = inputs.shape[0] inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.eval_loss_fn(logits, labels) if paddle.distributed.get_world_size() > 
1: label_list = [] paddle.distributed.all_gather(label_list, labels) labels = paddle.concat(label_list, 0) pred_list = [] paddle.distributed.all_gather(pred_list, logits) logits = paddle.concat(pred_list, 0) if self.eval_batch_size is None: self.eval_batch_size = logits.shape[0] acc = self.eval_metric_fn(logits, labels) self.acc_list.append(acc) return loss def validation_step_end(self, log_dict): ips = self.eval_batch_size / log_dict['eval_cost'] speed = self.configs['Engine']['logging_freq'] / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, step: [%d/%d], loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['loss'], log_dict['eval_cost'], ips)) def input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) def validation_epoch_end(self, log_dict): msg = '' if len(self.acc_list) > 0: ret = defaultdict(list) for item in self.acc_list: for key, val in item.items(): ret[key].append(val) for k, v in ret.items(): ret[k] = np.mean(v) if 'metric' in ret and ret['metric'] > self.best_metric: self.best_metric = ret['metric'] if 'metric' in ret: ret['best_metric'] = self.best_metric msg = ', ' msg += ", ".join([f'{k} = {v:.6f}' for k, v in ret.items()]) self.acc_list.clear() logger.info("[Eval] epoch: %d, total time: %.5f sec%s" % (log_dict['epoch'], log_dict['eval_cost'], msg)) class GeneralClsModuleAuto(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(GeneralClsModuleAuto, self).__init__(configs) assert 'loss' in self.model_configs self.loss_fn = build(self.model_configs.loss) if 'metric' in self.model_configs: self.metric_fn = build(self.model_configs.metric) def get_model(self): with LazyGuard(): if not hasattr(self, 'model') or self.model is None: self.model = build(self.model_configs.model) return self.model def input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] ================================================ FILE: ppfleetx/models/vision_model/layers/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from .mlp import * from .identity import * ================================================ FILE: ppfleetx/models/vision_model/layers/attention.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn import paddle.nn.functional as F from .initializer import xavier_uniform_, zeros_ class ViTAttention(nn.Layer): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): xavier_uniform_(m.weight) zeros_(m.bias) def forward(self, x): N, C = x.shape[1:] qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale attn = nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) x = (paddle.matmul(attn, v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) x = self.proj(x) x = self.proj_drop(x) return x ================================================ FILE: ppfleetx/models/vision_model/layers/droppath.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn def drop_path(x, drop_prob=0., training=False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... """ if drop_prob == 0. or not training: return x keep_prob = paddle.to_tensor(1 - drop_prob) shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) if x.dtype == paddle.float16: random_tensor = keep_prob + paddle.rand( shape, dtype=paddle.float32).astype(x.dtype) else: random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor return output class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ def __init__(self, drop_prob=None): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) ================================================ FILE: ppfleetx/models/vision_model/layers/embedding.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn class ViTPatchEmbed(nn.Layer): """ Image to Patch Embedding """ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size) patch_size = patch_size if isinstance(patch_size, tuple) else ( patch_size, patch_size) num_patches = (img_size[1] // patch_size[1]) * \ (img_size[0] // patch_size[0]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.proj = nn.Conv2D( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, C, H, W = x.shape assert H == self.img_size[0] and W == self.img_size[1], \ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." x = self.proj(x).flatten(2).transpose((0, 2, 1)) return x ================================================ FILE: ppfleetx/models/vision_model/layers/identity.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn __all__ = ['Identity', ] class Identity(nn.Layer): def __init__(self): super(Identity, self).__init__() def forward(self, input): return input ================================================ FILE: ppfleetx/models/vision_model/layers/initializer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import math from paddle.nn.initializer import Constant, Normal, XavierUniform, Uniform mlp_bias_normal_ = Normal(std=1e-6) pos_normal_ = Normal(std=0.02) xavier_uniform_ = XavierUniform() zeros_ = Constant(value=0.) minus_tens_ = Constant(value=-10.) ones_ = Constant(value=1.) 
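# Xavier/Glorot uniform initialization for parameters whose dimensions are
# grouped around a split axis (e.g. the fused attention qkv weight used in
# vit.py). fan_in and fan_out are taken as the products of the dims before and
# after `axis`, and values are drawn from U(-limit, limit) with
# limit = sqrt(6 / (fan_in + fan_out)).
#
# Illustrative usage sketch (shapes are hypothetical, not from this module):
#   >>> w = paddle.create_parameter(shape=[3, 12, 64, 768], dtype='float32')
#   >>> xavier_uniform_2d_(w, axis=-1)   # fan_in = 3*12*64, fan_out = 768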
def xavier_uniform_2d_(param, axis=-1): fan_in = int(np.prod(param.shape[:axis])) fan_out = int(np.prod(param.shape[axis:])) limit = math.sqrt(6.0 / (fan_in + fan_out)) uniform = Uniform(low=-limit, high=limit) uniform(param) ================================================ FILE: ppfleetx/models/vision_model/layers/mlp.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddle.nn as nn from .initializer import xavier_uniform_, mlp_bias_normal_ __all__ = ['ViTMLP', ] class ViTMLP(nn.Layer): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): xavier_uniform_(m.weight) mlp_bias_normal_(m.bias) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x ================================================ FILE: ppfleetx/models/vision_model/loss/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .cross_entropy import * ================================================ FILE: ppfleetx/models/vision_model/loss/cross_entropy.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
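# Cross-entropy losses used by the vision models in this package:
# - CELoss: softmax cross entropy with optional label smoothing
#   (epsilon in [0, 1]); accepts integer labels or soft/one-hot labels.
# - ViTCELoss: ViT-style sigmoid (binary) cross entropy over one-hot labels,
#   with optional label smoothing applied as label * (1 - epsilon) + epsilon.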
import paddle import paddle.nn as nn import paddle.nn.functional as F __all__ = [ 'ViTCELoss', 'CELoss', ] class CELoss(nn.Layer): """ Softmax Cross entropy loss """ def __init__(self, epsilon=None): super().__init__() if epsilon is not None: assert epsilon >= 0 and epsilon <= 1, "epsilon must be in [0, 1]" self.epsilon = epsilon def _labelsmoothing(self, target, class_num): if len(target.shape) == 1 or target.shape[-1] != class_num: one_hot_target = F.one_hot(target, class_num) else: one_hot_target = target soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) return soft_target def forward(self, x, label): if isinstance(x, dict): x = x["logits"] if self.epsilon is not None: class_num = x.shape[-1] label = self._labelsmoothing(label, class_num) x = -F.log_softmax(x, axis=-1) loss = paddle.sum(x * label, axis=-1) else: if label.shape[-1] == x.shape[-1]: loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: label = paddle.cast(label, 'int64') loss = F.cross_entropy(x, label=label, soft_label=False) loss = loss.mean() return loss class ViTCELoss(nn.Layer): """ ViT style Sigmoid Cross entropy loss """ def __init__(self, epsilon=None): super().__init__() if epsilon is not None: assert epsilon >= 0 and epsilon <= 1, "epsilon must be in [0, 1]" self.epsilon = epsilon def forward(self, x, label): if isinstance(x, dict): x = x["logits"] class_num = x.shape[-1] if len(label.shape) == 1 or label.shape[-1] != class_num: label = F.one_hot(label, class_num) label = paddle.reshape(label, shape=[-1, class_num]) if self.epsilon is not None: # vit style label smoothing with paddle.no_grad(): label = label * (1.0 - self.epsilon) + self.epsilon if x.dtype == paddle.float16: x = paddle.cast(x, 'float32') loss = F.binary_cross_entropy_with_logits(x, label, reduction='none') loss = paddle.sum(loss, axis=-1) loss = loss.mean() return loss ================================================ FILE: ppfleetx/models/vision_model/metrics/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .accuracy import * ================================================ FILE: ppfleetx/models/vision_model/metrics/accuracy.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
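# Top-k accuracy metric for classification. TopkAcc returns a dict such as
# {'top1': ..., 'top5': ..., 'metric': ...}, where 'metric' mirrors the first
# entry in `topk` and is the value the training modules track as best_metric.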
import paddle import paddle.nn as nn class TopkAcc(nn.Layer): def __init__(self, topk=(1, 5)): super().__init__() assert isinstance(topk, (int, list, tuple)) if isinstance(topk, int): topk = [topk] self.topk = topk def forward(self, x, label): if isinstance(x, dict): x = x["logits"] if len(label.shape) == 1: label = label.reshape([label.shape[0], -1]) if label.dtype == paddle.int32: label = paddle.cast(label, 'int64') metric_dict = dict() for i, k in enumerate(self.topk): acc = paddle.metric.accuracy(x, label, k=k).item() metric_dict["top{}".format(k)] = acc if i == 0: metric_dict["metric"] = acc return metric_dict ================================================ FILE: ppfleetx/models/vision_model/moco/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .moco import * ================================================ FILE: ppfleetx/models/vision_model/moco/moco.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections.abc import Callable import os import copy import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.nn.initializer import Constant, Normal from ..layers.identity import Identity __all__ = [ 'MoCo', 'MoCoV2Projector', 'MoCoClassifier', ] @paddle.no_grad() def concat_all_gather(tensor): """ Performs all_gather operation on the provided tensors. """ if paddle.distributed.get_world_size() < 2: return tensor tensors_gather = [] paddle.distributed.all_gather(tensors_gather, tensor) output = paddle.concat(tensors_gather, axis=0) return output class MoCoV2Projector(nn.Layer): def __init__(self, with_pool, in_dim, out_dim): super().__init__() self.with_pool = with_pool if with_pool: self.avgpool = nn.Sequential( nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1)) self.mlp = nn.Sequential(nn.Linear(in_dim, out_dim), nn.ReLU()) def forward(self, x): if self.with_pool: x = self.avgpool(x) x = self.mlp(x) return x class MoCoClassifier(nn.Layer): def __init__(self, with_pool, num_features, num_classes): super().__init__() self.with_pool = with_pool if with_pool: self.avgpool = nn.Sequential( nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1)) self.fc = nn.Linear(num_features, num_classes) normal_ = Normal(std=0.01) zeros_ = Constant(value=0.) 
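# Initialize the linear classification head following the MoCo reference
# implementation: weights drawn from N(0, 0.01), biases set to zero.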
normal_(self.fc.weight) zeros_(self.fc.bias) def forward(self, x): if self.with_pool: x = self.avgpool(x) x = self.fc(x) return x class MoCo(nn.Layer): """ MoCo v1, v2 ref: https://github.com/facebookresearch/moco/blob/main/moco/builder.py ref: https://github.com/PaddlePaddle/PASSL/blob/main/passl/modeling/architectures/moco.py """ def __init__(self, base_encoder, base_projector, base_classifier, momentum_encoder, momentum_projector, momentum_classifier, dim=128, K=65536, m=0.999, T=0.07, **kwargs): super(MoCo, self).__init__() self.m = m self.T = T self.K = K self.base_encoder = nn.Sequential(base_encoder, base_projector, base_classifier) self.momentum_encoder = nn.Sequential( momentum_encoder, momentum_projector, momentum_classifier) for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()): param_m.copy_(param_b, False) # initialize param_m.stop_gradient = True # not update by gradient # create the queue self.register_buffer("queue", paddle.randn([dim, K])) self.queue = F.normalize(self.queue, axis=0) self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) @paddle.no_grad() def _update_momentum_encoder(self): """Momentum update of the momentum encoder""" #Note(GuoxiaWang): disable auto cast when use mix_precision with paddle.amp.auto_cast(False): for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()): paddle.assign((param_m * self.m + param_b * (1. - self.m)), param_m) param_m.stop_gradient = True @paddle.no_grad() def _dequeue_and_enqueue(self, keys): keys = concat_all_gather(keys) batch_size = keys.shape[0] ptr = int(self.queue_ptr[0]) assert self.K % batch_size == 0 # for simplicity # replace the keys at ptr (dequeue and enqueue) self.queue[:, ptr:ptr + batch_size] = keys.transpose([1, 0]) ptr = (ptr + batch_size) % self.K # move pointer self.queue_ptr[0] = ptr @paddle.no_grad() def _batch_shuffle_ddp(self, x): """ Batch shuffle, for making use of BatchNorm. *** Only support DistributedDataParallel (DDP) model. *** """ # gather from all gpus batch_size_this = x.shape[0] x_gather = concat_all_gather(x) batch_size_all = x_gather.shape[0] num_gpus = batch_size_all // batch_size_this # random shuffle index idx_shuffle = paddle.randperm(batch_size_all) # broadcast to all gpus if paddle.distributed.get_world_size() > 1: paddle.distributed.broadcast(idx_shuffle, src=0) # index for restoring idx_unshuffle = paddle.argsort(idx_shuffle) # shuffled index for this gpu gpu_idx = paddle.distributed.get_rank() idx_this = idx_shuffle.reshape([num_gpus, -1])[gpu_idx] return paddle.gather(x_gather, idx_this, axis=0), idx_unshuffle @paddle.no_grad() def _batch_unshuffle_ddp(self, x, idx_unshuffle): """ Undo batch shuffle. *** Only support DistributedDataParallel (DDP) model. 
*** """ # gather from all gpus batch_size_this = x.shape[0] x_gather = concat_all_gather(x) batch_size_all = x_gather.shape[0] num_gpus = batch_size_all // batch_size_this # restored index for this gpu gpu_idx = paddle.distributed.get_rank() idx_this = idx_unshuffle.reshape([num_gpus, -1])[gpu_idx] return paddle.gather(x_gather, idx_this, axis=0) def forward(self, x1, x2): # compute query features q = self.base_encoder(x1) # queries: NxC q = F.normalize(q, axis=1) # compute key features with paddle.no_grad(): # no gradient self._update_momentum_encoder() # update the momentum encoder # shuffle for making use of BN k, idx_unshuffle = self._batch_shuffle_ddp(x2) k = self.momentum_encoder(k) # keys: NxC k = F.normalize(k, axis=1) # undo shuffle k = self._batch_unshuffle_ddp(k, idx_unshuffle) # compute logits # Einstein sum is more intuitive # positive logits: Nx1 l_pos = paddle.sum(q * k, axis=1).unsqueeze(-1) # negative logits: NxK l_neg = paddle.matmul(q, self.queue.clone().detach()) # logits: Nx(1+K) logits = paddle.concat((l_pos, l_neg), axis=1) # apply temperature logits /= self.T # labels: positive key indicators labels = paddle.zeros([logits.shape[0]], dtype=paddle.int64) # dequeue and enqueue self._dequeue_and_enqueue(k) return (logits, labels) ================================================ FILE: ppfleetx/models/vision_model/moco_module.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
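# Training modules for MoCo:
# - MOCOModule wraps the MoCo pretraining graph (query/momentum encoders,
#   projector, classifier and the negative queue) and applies the configured
#   loss to the (logits, labels) pair returned by MoCo.forward.
# - MOCOClsModule performs linear-probe evaluation: it loads a pretrained
#   base encoder, freezes its parameters and batch-norm statistics, and
#   trains only the classification head.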
import os import sys import copy import datetime from collections import defaultdict import numpy as np import paddle import paddle.nn as nn from ppfleetx.utils.log import logger from ppfleetx.core.module.basic_module import BasicModule from .factory import build from .moco import MoCo class MOCOModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(MOCOModule, self).__init__(configs) assert 'train' in self.model_configs.loss self.loss_fn = build(self.model_configs.loss.train) self.train_batch_size = None self.best_metric = 0.0 def get_model(self): if not hasattr(self, 'model') or self.model is None: config = copy.deepcopy(self.model_configs.model) base_encoder = build(self.model_configs.model.base_encoder) base_projector = build( self.model_configs.model.get('base_projector', {"name": "Identity"})) base_classifier = build(self.model_configs.model.base_classifier) momentum_encoder = build(self.model_configs.model.momentum_encoder) momentum_projector = build( self.model_configs.model.get('momentum_projector', {"name": "Identity"})) momentum_classifier = build( self.model_configs.model.momentum_classifier) config['base_encoder'] = base_encoder config['base_projector'] = base_projector config['base_classifier'] = base_classifier config['momentum_encoder'] = momentum_encoder config['momentum_projector'] = momentum_projector config['momentum_classifier'] = momentum_classifier self.model = MoCo(**config) return self.model def forward(self, img_q, img_k): return self.model(img_q, img_k) def training_step(self, batch): img_q, img_k = batch # Note(GuoxiaWang)paddle.distributed.all_gather required CudaPlace img_q = img_q.cuda() img_k = img_k.cuda() if self.train_batch_size is None: self.train_batch_size = img_q.shape[ 0] * paddle.distributed.get_world_size() logits, labels = self(img_q, img_k) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): ips = self.train_batch_size / log_dict['train_cost'] total_step = log_dict['total_epoch'] * log_dict['total_batch'] cur_step = log_dict['epoch'] * log_dict['total_batch'] + log_dict[ 'batch'] + 1 remained_step = total_step - cur_step eta_sec = remained_step * log_dict['train_cost'] eta_msg = "eta: {:s}".format( str(datetime.timedelta(seconds=int(eta_sec)))) logger.info( "[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec, %s" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips, eta_msg)) def input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) class MOCOClsModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() self.model_configs = copy.deepcopy(configs.Model) self.model_configs.pop('module') # must init before loss function super(MOCOClsModule, self).__init__(configs) assert 'train' in self.model_configs.loss self.loss_fn = build(self.model_configs.loss.train) self.eval_loss_fn = None if 'eval' in self.model_configs.loss: self.eval_loss_fn = build(self.model_configs.loss.eval) if 'train' in self.model_configs.metric: self.train_metric_fn = build(self.model_configs.metric.train) if 'eval' in 
self.model_configs.metric: self.eval_metric_fn = build(self.model_configs.metric.eval) self.train_batch_size = None self.eval_batch_size = None self.best_metric = 0.0 self.acc_list = [] def _freeze_backbone(self, layer): for param in layer.parameters(): param.trainable = False def freeze_norm(layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True layer.apply(freeze_norm) def get_model(self): if not hasattr(self, 'model') or self.model is None: pretrained_path = self.model_configs.model.base_encoder.pop( "pretrained") base_encoder = build(self.model_configs.model.base_encoder) self._freeze_backbone(base_encoder) pretrained_path = pretrained_path + ".pdparams" assert os.path.exists( pretrained_path), f'{pretrained_path} is not exists!' base_encoder_dict = paddle.load(pretrained_path) for k in list(base_encoder_dict.keys()): # retain only encoder_q up to before the embedding layer if k.startswith('base_encoder.0.'): # remove prefix base_encoder_dict[k[len( "base_encoder.0."):]] = base_encoder_dict[k] # delete renamed del base_encoder_dict[k] for name, param in base_encoder.state_dict().items(): if name in base_encoder_dict and param.dtype != base_encoder_dict[ name].dtype: base_encoder_dict[name] = base_encoder_dict[name].cast( param.dtype) base_encoder.set_state_dict(base_encoder_dict) logger.info(f'Load pretrained weight from {pretrained_path}') base_classifier = build(self.model_configs.model.base_classifier) self.model = nn.Sequential(base_encoder, base_classifier) return self.model def forward(self, inputs): return self.model(inputs) def training_step(self, batch): inputs, labels = batch if self.train_batch_size is None: self.train_batch_size = inputs.shape[ 0] * paddle.distributed.get_world_size() inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.loss_fn(logits, labels) return loss def training_step_end(self, log_dict): ips = self.train_batch_size / log_dict['train_cost'] total_step = log_dict['total_epoch'] * log_dict['total_batch'] cur_step = log_dict['epoch'] * log_dict['total_batch'] + log_dict[ 'batch'] + 1 remained_step = total_step - cur_step eta_sec = remained_step * log_dict['train_cost'] eta_msg = "eta: {:s}".format( str(datetime.timedelta(seconds=int(eta_sec)))) logger.info( "[train] epoch: %d, step: [%d/%d], learning rate: %.7f, loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec, %s" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['lr'], log_dict['loss'], log_dict['train_cost'], ips, eta_msg)) def validation_step(self, batch): inputs, labels = batch batch_size = inputs.shape[0] inputs.stop_gradient = True labels.stop_gradient = True logits = self(inputs) loss = self.eval_loss_fn(logits, labels) if paddle.distributed.get_world_size() > 1: label_list = [] paddle.distributed.all_gather(label_list, labels) labels = paddle.concat(label_list, 0) pred_list = [] paddle.distributed.all_gather(pred_list, logits) logits = paddle.concat(pred_list, 0) if self.eval_batch_size is None: self.eval_batch_size = logits.shape[0] acc = self.eval_metric_fn(logits, labels) self.acc_list.append(acc) return loss def validation_step_end(self, log_dict): ips = self.eval_batch_size / log_dict['eval_cost'] speed = self.configs['Engine']['logging_freq'] / log_dict['eval_cost'] logger.info( "[eval] epoch: %d, step: [%d/%d], loss: %.9f, batch_cost: %.5f sec, ips: %.2f images/sec" % (log_dict['epoch'], log_dict['batch'], log_dict['total_batch'], log_dict['loss'], log_dict['eval_cost'], ips)) def 
input_spec(self): return [ InputSpec( shape=[None, 3, 224, 224], name="images", dtype='float32') ] def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) def validation_epoch_end(self, log_dict): msg = '' if len(self.acc_list) > 0: ret = defaultdict(list) for item in self.acc_list: for key, val in item.items(): ret[key].append(val) for k, v in ret.items(): ret[k] = np.mean(v) if 'metric' in ret and ret['metric'] > self.best_metric: self.best_metric = ret['metric'] if 'metric' in ret: ret['best_metric'] = self.best_metric msg = ', ' msg += ", ".join([f'{k} = {v:.6f}' for k, v in ret.items()]) self.acc_list.clear() logger.info("[Eval] epoch: %d, total time: %.5f sec%s" % (log_dict['epoch'], log_dict['eval_cost'], msg)) ================================================ FILE: ppfleetx/models/vision_model/resnet/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.vision.models.resnet import resnet18, resnet34, resnet50, resnet101, resnet152 __all__ = [ 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', ] ================================================ FILE: ppfleetx/models/vision_model/vit/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .vit import * ================================================ FILE: ppfleetx/models/vision_model/vit/vit.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
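# Vision Transformer (ViT) backbone. Besides the reference Block (LayerNorm ->
# ViTAttention -> DropPath residual, then LayerNorm -> ViTMLP -> DropPath
# residual), a FusedBlock variant built on paddle.incubate.nn's
# FusedMultiHeadAttention and FusedFeedForward can be selected with
# use_fused_attn=True; its state_dict is converted to and from the plain
# attention layout so checkpoints remain interchangeable.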
from collections.abc import Callable import os import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.incubate.nn import FusedMultiHeadAttention, FusedFeedForward from ppfleetx.utils.log import logger from ..layers.droppath import DropPath from ..layers.identity import Identity from ..layers.attention import ViTAttention from ..layers.embedding import ViTPatchEmbed from ..layers.mlp import ViTMLP from ..layers.initializer import (xavier_uniform_, xavier_uniform_2d_, mlp_bias_normal_, zeros_, minus_tens_, pos_normal_, ones_) __all__ = [ 'ViT_tiny_patch16_224', 'ViT_base_patch16_224', 'ViT_base_patch16_384', 'ViT_base_patch32_224', 'ViT_base_patch32_384', 'ViT_large_patch16_224', 'ViT_large_patch16_384', 'ViT_large_patch32_224', 'ViT_large_patch32_384', 'ViT_huge_patch14_224', 'ViT_huge_patch14_384', 'ViT_g_patch14_224', 'ViT_G_patch14_224', 'ViT_6B_patch14_224', 'ViT', ] class FusedBlock(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() assert qk_scale is None, "Fused attention doesn't support qk_scale." if isinstance(drop_path, (float, int)): assert drop_path == 0.0, "Fused attention doesn't support drop_path." elif isinstance(drop_path, (tuple, list)): assert drop_path == [0.0] * len( drop_path), "Fused attention doesn't support drop_path." assert norm_layer == "nn.LayerNorm", "Fused attention only support nn.LayerNorm" assert ((act_layer == nn.GELU) or (act_layer == nn.ReLU)) or \ (isinstance(act_layer, str) and act_layer.lower() == "gelu" or act_layer.lower() == "relu"), \ "Fused attention only support GELU and ReLU activation." self.attn = FusedMultiHeadAttention( dim, num_heads=num_heads, qkv_bias_attr=qkv_bias, dropout_rate=drop, attn_dropout_rate=attn_drop, normalize_before=True, epsilon=epsilon) mlp_hidden_dim = int(dim * mlp_ratio) if (act_layer == nn.GELU) or act_layer.lower() == "gelu": act_func = "gelu" else: act_func = "relu" self.mlp = FusedFeedForward( d_model=dim, dim_feedforward=mlp_hidden_dim, dropout_rate=drop, activation=act_func, act_dropout_rate=drop, normalize_before=True) xavier_uniform_2d_(self.attn.qkv_weight) xavier_uniform_2d_(self.attn.linear_weight) xavier_uniform_2d_(self.mlp._linear1_weight) xavier_uniform_2d_(self.mlp._linear2_weight) zeros_(self.attn.qkv_bias) zeros_(self.attn.linear_bias) mlp_bias_normal_(self.mlp._linear1_bias) mlp_bias_normal_(self.mlp._linear2_bias) def forward(self, x): return self.mlp(self.attn(x)) class Block(nn.Layer): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer='nn.LayerNorm', epsilon=1e-5): super().__init__() if isinstance(norm_layer, str): self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm1 = norm_layer(dim) else: raise TypeError( "The norm_layer must be str or paddle.nn.layer.Layer class") self.attn = ViTAttention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() if isinstance(norm_layer, str): self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm2 = norm_layer(dim) else: raise TypeError( "The norm_layer must be str or paddle.nn.layer.Layer class") mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = ViTMLP( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) def forward(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class ViT(nn.Layer): """ Vision Transformer with support for patch input """ def __init__(self, img_size=224, patch_size=16, in_chans=3, class_num=1000, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer='nn.LayerNorm', epsilon=1e-5, representation_size=None, use_fused_attn=False, **kwargs): super().__init__() self.class_num = class_num self.representation_size = representation_size self.num_heads = num_heads self.num_features = self.embed_dim = embed_dim self.patch_embed = ViTPatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) num_patches = self.patch_embed.num_patches self.pos_embed = self.create_parameter( shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) self.cls_token = self.create_parameter( shape=(1, 1, embed_dim), default_initializer=zeros_) self.pos_drop = nn.Dropout(p=drop_rate) dpr = np.linspace(0, drop_path_rate, depth) self.use_fused_attn = use_fused_attn block_fn = FusedBlock if self.use_fused_attn else Block if self.use_fused_attn: logger.info( "ViT use fused attention. Fused attention model checkpoint will be" \ " saved in normal attention format for inference checkpoint export," \ " and its optimizer checkpoint keeps the same.") self.blocks = nn.LayerList([ block_fn( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, epsilon=epsilon) for i in range(depth) ]) self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) # Classifier head if self.representation_size is not None: self.head0 = nn.Linear(embed_dim, representation_size) self.tanh = nn.Tanh() self.head = nn.Linear(representation_size, class_num) if class_num > 0 else Identity() xavier_uniform_(self.head0.weight) zeros_(self.head0.bias) xavier_uniform_(self.head.weight) minus_tens_(self.head.bias) else: self.head = nn.Linear(embed_dim, class_num) if class_num > 0 else Identity() zeros_(self.head.weight) zeros_(self.head.bias) pos_normal_(self.pos_embed) zeros_(self.cls_token) self.apply(self._init_weights) pretrained_configs = kwargs.pop('pretrained', None) if pretrained_configs is not None: self.load_pretrained(**pretrained_configs) def _init_weights(self, m): if isinstance(m, nn.LayerNorm): zeros_(m.bias) ones_(m.weight) def forward_features(self, x): # B = x.shape[0] B = paddle.shape(x)[0] x = self.patch_embed(x) cls_tokens = self.cls_token.expand((B, -1, -1)) x = paddle.concat((cls_tokens, x), axis=1) x = x + self.pos_embed x = self.pos_drop(x) for blk in self.blocks: x = blk(x) x = self.norm(x) return x[:, 0] def forward(self, x): x = self.forward_features(x) if self.representation_size is not None: x = self.tanh(self.head0(x)) x = self.head(x) return x # Saved the fused attention checkpoint in origin attention checkpoint format replaced_dict = { # FusedMultiHeadAttention 'attn.pre_ln_scale': 'norm1.weight', 'attn.pre_ln_bias': 
'norm1.bias', 'attn.qkv_weight': 'attn.qkv.weight', 'attn.qkv_bias': 'attn.qkv.bias', 'attn.linear_weight': 'attn.proj.weight', 'attn.linear_bias': 'attn.proj.bias', # FusedFeedForward 'mlp._ln1_scale': 'norm2.weight', 'mlp._ln1_bias': 'norm2.bias', 'mlp._linear1_weight': 'mlp.fc1.weight', 'mlp._linear1_bias': 'mlp.fc1.bias', 'mlp._linear2_weight': 'mlp.fc2.weight', 'mlp._linear2_bias': 'mlp.fc2.bias', } @paddle.no_grad() def state_dict(self, destination=None, include_sublayers=True, structured_name_prefix="", use_hook=True): state_dict = super().state_dict(destination, include_sublayers, structured_name_prefix, use_hook) if self.use_fused_attn: new_dict = [] poped_keys = [] for key, value in state_dict.items(): new_key = "" for k, v in self.replaced_dict.items(): if k in key: new_key = key.replace(k, v) break if new_key != "": value_name = value.name if 'attn.qkv.weight' in new_key: value = value.reshape([-1, value.shape[-1]]).transpose( [1, 0]) if 'attn.qkv.bias' in new_key: value = value.reshape([-1]) # value is a Tensor after transformation, # it will be transformed to ParamBase for auto_infer param = paddle.create_parameter( shape=value.shape, dtype=value.dtype) param.set_value(value) param.name = value_name new_dict.append({new_key: param}) poped_keys.append(key) for i in range(len(new_dict)): state_dict.update(new_dict[i]) state_dict.pop(poped_keys[i]) return state_dict @paddle.no_grad() def set_state_dict(self, state_dict, use_structured_name=True): reversed_replaced_dict = {} for k, v in self.replaced_dict.items(): reversed_replaced_dict.update({v: k}) if self.use_fused_attn: new_dict = [] poped_keys = [] for key, value in state_dict.items(): new_key = "" for k, v in reversed_replaced_dict.items(): if k in key: new_key = key.replace(k, v) break if new_key != "": if 'attn.qkv_weight' in new_key: value = value.transpose([1, 0]) value = value.reshape( [3, self.num_heads, -1, value.shape[-1]]) if 'attn.qkv_bias' in new_key: value = value.reshape([3, self.num_heads, -1]) new_dict.append({new_key: value}) poped_keys.append(key) for i in range(len(new_dict)): state_dict.update(new_dict[i]) state_dict.pop(poped_keys[i]) super().set_state_dict(state_dict) def load_pretrained(self, prefix_path, finetune=False): if not os.path.exists(prefix_path + '.pdparams'): raise ValueError("Model pretrain path {} does not " "exists.".format(prefix_path)) state_dict = self.state_dict() param_state_dict = paddle.load(prefix_path + ".pdparams") # for FP16 saving pretrained weight for key, value in param_state_dict.items(): param_state_dict[key] = param_state_dict[key].astype( paddle.float32) if not finetune: self.set_state_dict(param_state_dict) return for k in ['head0.weight', 'head0.bias', 'head.weight', 'head.bias']: if k in param_state_dict: print(f"Removing key {k} from pretrained checkpoint") del param_state_dict[k] # interpolate position embedding pos_embed_checkpoint = param_state_dict['pos_embed'] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = self.patch_embed.num_patches num_extra_tokens = self.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** 0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] pos_tokens = paddle.transpose( 
pos_tokens.reshape([-1, orig_size, orig_size, embedding_size]), perm=[0, 3, 1, 2]) dtype = pos_tokens.dtype pos_tokens = paddle.nn.functional.interpolate( pos_tokens.astype(paddle.float32), size=(new_size, new_size), mode='bicubic', align_corners=False).astype(dtype) pos_tokens = paddle.transpose( pos_tokens, perm=[0, 2, 3, 1]).flatten(1, 2) new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) param_state_dict['pos_embed'] = new_pos_embed self.set_state_dict(param_state_dict) return def ViT_tiny_patch16_224(**kwargs): model = ViT(patch_size=16, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=192, **kwargs) return model def ViT_base_patch16_224(**kwargs): model = ViT(patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=768, **kwargs) return model def ViT_base_patch16_384(**kwargs): model = ViT(img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_base_patch32_224(**kwargs): model = ViT(patch_size=32, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=768, **kwargs) return model def ViT_base_patch32_384(**kwargs): model = ViT(img_size=384, patch_size=32, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_large_patch16_224(**kwargs): model = ViT(patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=1024, **kwargs) return model def ViT_large_patch16_384(**kwargs): model = ViT(img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_large_patch32_224(**kwargs): model = ViT(patch_size=32, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=1024, **kwargs) return model def ViT_large_patch32_384(**kwargs): model = ViT(img_size=384, patch_size=32, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_huge_patch14_224(**kwargs): model = ViT(patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=1280, **kwargs) return model def ViT_huge_patch14_384(**kwargs): model = ViT(img_size=384, patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, epsilon=1e-6, representation_size=None, **kwargs) return model def ViT_g_patch14_224(**kwargs): model = ViT(img_size=224, patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=4.364, qkv_bias=True, epsilon=1e-6, representation_size=1408, **kwargs) return model def ViT_G_patch14_224(**kwargs): model = ViT(img_size=224, patch_size=14, embed_dim=1664, depth=48, num_heads=16, mlp_ratio=4.9231, qkv_bias=True, epsilon=1e-6, representation_size=1664, **kwargs) return model def ViT_6B_patch14_224(**kwargs): model = ViT(img_size=224, patch_size=14, embed_dim=2320, depth=80, num_heads=16, mlp_ratio=4.955, qkv_bias=True, epsilon=1e-6, representation_size=2320, **kwargs) return model ================================================ FILE: ppfleetx/ops/setup_cuda.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.utils.cpp_extension import CUDAExtension, setup setup( name='ppfleetx_ops', ext_modules=CUDAExtension(sources=['topp_sampling.cu'])) ================================================ FILE: ppfleetx/ops/test_topp_sampling.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import numpy as np from ppfleetx.ops import topp_sampling paddle.seed(2022) x = paddle.randn([1, 51200], dtype="float16") x = paddle.nn.functional.softmax(x) top_ps = paddle.to_tensor(np.random.uniform(0, 1, [1]).astype(np.float16)) out = topp_sampling(x, top_ps) print(out) ================================================ FILE: ppfleetx/ops/topp_sampling.cu ================================================ // Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include "cub/cub.cuh" #include "paddle/extension.h" #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") #define FINAL_MASK 0xFFFFFFFF #define FIXED_BLOCK_DIM_BASE(dim, ...) \ case (dim): { \ constexpr auto kBlockDim = (dim); \ __VA_ARGS__; \ } break #define FIXED_BLOCK_DIM(...) 
\ FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) template class PDTraits; template <> class PDTraits { public: typedef float DataType; typedef float data_t; }; template <> class PDTraits { public: typedef half DataType; typedef paddle::float16 data_t; }; struct SegmentOffsetIter { explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} __host__ __device__ __forceinline__ int operator()(int idx) const { return idx * num_cols_; } int num_cols_; }; template struct Pair { __device__ __forceinline__ Pair() {} __device__ __forceinline__ Pair(T value, int id) : v(value), id(id) {} __device__ __forceinline__ void set(T value, int id) { v = value; id = id; } __device__ __forceinline__ void operator=(const Pair& in) { v = in.v; id = in.id; } __device__ __forceinline__ bool operator<(const T value) const { return ((float)v < (float)value); } __device__ __forceinline__ bool operator>(const T value) const { return ((float)v > (float)value); } __device__ __forceinline__ bool operator<(const Pair& in) const { return ((float)v < (float)in.v) || (((float)v == (float)in.v) && (id > in.id)); } __device__ __forceinline__ bool operator>(const Pair& in) const { return ((float)v > (float)in.v) || (((float)v == (float)in.v) && (id < in.id)); } T v; int id; }; inline int div_up(int a, int n) { return (a + n - 1) / n; } __global__ void setup_kernel(curandState_t *state, const uint64_t seed, const int bs) { int idx = blockIdx.x * blockDim.x + threadIdx.x; for (int i = idx; i < bs; i += gridDim.x * blockDim.x) { curand_init(seed, 0, i, &state[i]); } } template __device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, int beam_size) { for (int k = beam_size - 2; k >= 0; k--) { if (topk[k] < p) { topk[k + 1] = topk[k]; } else { topk[k + 1] = p; return; } } topk[0] = p; } template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, int dim, int beam_size) { while (idx < dim) { if (topk[beam_size - 1] < src[idx]) { Pair tmp(src[idx], idx); AddTo(topk, tmp, beam_size); } idx += BlockSize; } } template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, int dim, const Pair& max, int beam_size) { while (idx < dim) { if (topk[beam_size - 1] < src[idx]) { Pair tmp(src[idx], idx); if (tmp < max) { AddTo(topk, tmp, beam_size); } } idx += BlockSize; } } template __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, bool* firstStep, bool* is_empty, Pair* max, int dim, const int tid) { if (*beam > 0) { int length = (*beam) < beam_size ? 
*beam : beam_size; if (*firstStep) { *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { topk[k].set(std::numeric_limits::min(), -1); } } if (!(*is_empty)) { GetTopK( topk + MaxLength - *beam, src, tid, dim, *max, length); } } *max = topk[MaxLength - 1]; if ((*max).id == -1) *is_empty = true; *beam = 0; } } template __forceinline__ __device__ Pair WarpReduce(Pair input) { #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { T tmp_val = __shfl_down_sync(FINAL_MASK, input.v, static_cast(offset), 32); int tmp_id = __shfl_down_sync(FINAL_MASK, input.id, static_cast(offset), 32); if ((float)input.v < (float)tmp_val) { input.v = tmp_val; input.id = tmp_id; } } return input; } template __device__ __forceinline__ void BlockReduce(Pair shared_max[], Pair topk[], Pair beam_max[], int* beam, int* k, int *count, const int tid, const int wid, const int lane) { while (true) { __syncthreads(); Pair input_now = topk[0]; input_now = WarpReduce(input_now); if (lane == 0) { shared_max[wid] = input_now; } __syncthreads(); input_now = (tid < BlockSize / 32) ? shared_max[lane] : Pair(std::numeric_limits::min(), -1); if (wid == 0) { input_now = WarpReduce(input_now); if (lane == 0) shared_max[0] = input_now; } __syncthreads(); if (tid == 0) { beam_max[*count] = shared_max[0]; (*count)++; } int tid_max = shared_max[0].id % BlockSize; if (tid == tid_max) { (*beam)++; } if (--(*k) == 0) break; __syncthreads(); if (tid == tid_max) { if (*beam < MaxLength) { topk[0] = topk[*beam]; } } if (MaxLength < 5) { if (*beam >= MaxLength) break; } else { unsigned mask = 0u; mask = __ballot_sync(FINAL_MASK, true); if (tid_max / 32 == wid) { if (__shfl_down_sync(FINAL_MASK, *beam, tid_max % 32, 32) == MaxLength) break; } } } } template __global__ void KeMatrixTopPBeamTopK(const T* src, T *top_ps, int64_t *out_id, // topk id T *out_val, // topk val int vocab_size, curandState_t *state, int *count_iter, int *count_iter_begin) { const int tid = threadIdx.x; const int wid = tid / 32; const int lane = tid % 32; const int bid = blockIdx.x; int top_num = TopPBeamTopK; float top_p_num = (float)top_ps[bid]; __shared__ Pair shared_max[BlockSize / 32]; __shared__ Pair beam_max[TopPBeamTopK]; Pair topk[MaxLength]; int beam = MaxLength; Pair max; bool is_empty = false; bool firststep = true; __shared__ int count; if (tid == 0) { count = 0; } for (int j = 0; j < MaxLength; j++) { topk[j].set(std::numeric_limits::min(), -1); } while (top_num) { ThreadGetTopK(topk, &beam, TopPBeamTopK, src + bid * vocab_size, &firststep, &is_empty, &max, vocab_size, tid); BlockReduce(shared_max, topk, beam_max, &beam, &top_num, &count, tid, wid, lane); } if (tid == 0) { count_iter_begin[bid] = count_iter[bid]; float rand_top_p = curand_uniform(state + bid) * top_p_num; top_ps[bid] = (T)rand_top_p; float sum_prob = 0.0f; #pragma unroll for(int i = 0; i < TopPBeamTopK; i++) { sum_prob += (float)(beam_max[i].v); if(sum_prob >= rand_top_p) { count_iter_begin[bid] += 1; out_id[bid] = (int64_t)beam_max[i].id; out_val[bid] = beam_max[i].v; break; } } } } __global__ void SetCountIter(int *count_iter, int num) { int tid = threadIdx.x; int bid = blockIdx.x; int idx = bid * blockDim.x + tid; for (int i = idx; i < num; i += gridDim.x * blockDim.x) { count_iter[i] = i; } } template __global__ void FillIndex(T* indices, T num_rows, T num_cols) { int col_id = threadIdx.x; int row_id = blockIdx.x; for (T j = row_id; j < num_rows; j += gridDim.x) { 
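// Descriptive note: this is a grid-stride fill. Each block walks the rows
// (j advances by gridDim.x) and, within a row, each thread writes the column
// positions it owns (i advances by blockDim.x), so indices[j][i] = i for every
// vocabulary slot of every batch row before the segmented sort reorders them.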
for (T i = col_id; i < num_cols; i += blockDim.x) { indices[j * num_cols + i] = i; } } } struct BlockPrefixCallbackOp { // Running prefix float running_total; // Constructor __device__ BlockPrefixCallbackOp(float running_total): running_total(running_total) {} // Callback operator to be entered by the first warp of threads in the block. // Thread-0 is responsible for returning a value for seeding the block-wide scan. __device__ float operator()(float block_aggregate) { float old_prefix = running_total; running_total += block_aggregate; return old_prefix; } }; template __global__ void topp_sampling(T *sorted_probs, int64_t *sorted_id, T *out_val, int64_t *out_id, const T *top_ps, int p_num, int vocab_size, int *count_iter, int *count_iter_begin) { __shared__ int stop_shared; __shared__ float rand_p; const int tid = threadIdx.x; const int bid = blockIdx.x; constexpr int WARP_SIZE = 32; constexpr int NUM_WARPS = BLOCK_SIZE / WARP_SIZE; const int lane_id = tid % WARP_SIZE; const int warp_id = tid / WARP_SIZE; const float p_t = (float)top_ps[bid]; if (tid == 0) { stop_shared = 0; rand_p = p_t; } if (count_iter_begin[bid] == count_iter[bid + 1]) { // topk return; } typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; __shared__ uint32_t selected_shared[NUM_WARPS]; // Initialize running total BlockPrefixCallbackOp prefix_op(0); if (lane_id == 0) { selected_shared[warp_id] = 0; } __syncthreads(); int offset = bid * vocab_size; int end = ((vocab_size + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; int i_activate = 0; float thread_offset = 0; for (int i = tid; i < end; i += BLOCK_SIZE) { float thread_count = (i < vocab_size) ? (float)sorted_probs[offset + i] : 0.f; BlockScan(temp_storage).InclusiveSum(thread_count, thread_offset, prefix_op); uint32_t activate_mask = __ballot_sync(FINAL_MASK, rand_p <= thread_offset); i_activate = i; if (activate_mask != 0) { if (lane_id == 0) { atomicAdd(&stop_shared, 1); selected_shared[warp_id] = activate_mask; } } __syncthreads(); if(stop_shared > 0) { break; } } bool skip = (selected_shared[warp_id] > 0) ? 
false : true; for (int i=0; i < warp_id; i++) { if(selected_shared[i] != 0) { skip = true; } } if (!skip) { int active_lane_id = WARP_SIZE - __popc(selected_shared[warp_id]); // first not 0 if (lane_id == active_lane_id) { // printf("active_lane_id: %d, i_activate: %d.\n", active_lane_id, i_activate); // for (int i=0; i < active_lane_id; i++) { // printf("p %d, value: %f\n", i, (float)(sorted_probs[offset + i])); // } out_id[bid] = sorted_id[offset + i_activate]; out_val[bid] = sorted_probs[offset + i_activate]; } } } int GetBlockSize(int vocab_size) { if (vocab_size > 512) { return 1024; } else if (vocab_size > 256) { return 512; } else if (vocab_size > 128) { return 256; } else if (vocab_size > 64) { return 128; } else { return 64; } } template __global__ void print_kernel(T *input, int size) { printf("["); for (int i=0; i < size; i++) { if (i != size-1) { printf("%f, ", (float)input[i]); } else { printf("%f]\n", (float)input[i]); } } } template std::vector top_p_sampling_kernel(const paddle::Tensor& x, const paddle::Tensor& top_ps, int random_seed) { typedef PDTraits traits_; typedef typename traits_::DataType DataType_; typedef typename traits_::data_t data_t; std::vector shape = x.shape(); auto cu_stream = x.stream(); int bs = shape[0]; int p_num = top_ps.numel(); PD_CHECK(bs == p_num, "PD_CHECK returns ", false, ", expected bs == p_num."); int vocab_size = shape[1]; auto topp_ids = paddle::full({bs, 1}, 1, paddle::DataType::INT64, x.place()); auto topp_probs = paddle::full({bs, 1}, 1, x.dtype(), x.place()); auto inds_input = paddle::full({bs, vocab_size}, 1, paddle::DataType::INT64, x.place()); auto sorted_out = paddle::full({bs, vocab_size}, 1, x.dtype(), x.place()); auto sorted_id = paddle::full({bs, vocab_size}, 1, paddle::DataType::INT64, x.place()); int BlockSize = GetBlockSize(vocab_size); switch (BlockSize) { FIXED_BLOCK_DIM(FillIndex<<>>(inds_input.data(), bs, vocab_size)); default: PD_THROW("the input data shape has error in the FillIndex kernel."); } static int count = 0; static curandState_t* dev_curand_states; if (count == 0) { #if CUDA_VERSION >= 11020 cudaMallocAsync(&dev_curand_states, bs * sizeof(curandState_t), cu_stream); #else cudaMalloc(&dev_curand_states, bs * sizeof(curandState_t)); #endif } srand((unsigned int)(time(NULL))); setup_kernel<<<1, 256, 0, cu_stream>>>(dev_curand_states, rand() % random_seed, bs); PD_CHECK(bs == p_num, "PD_CHECK returns ", false, ", expected bs == p_num."); auto count_iter = paddle::empty({bs + 1}, paddle::DataType::INT32, x.place()); auto count_iter_begin = paddle::empty({bs}, paddle::DataType::INT32, x.place()); SetCountIter<<<1, 256, 0, cu_stream>>>(count_iter.data(), bs + 1); constexpr int TopKMaxLength = 1; constexpr int TopPBeamTopK = 1; switch (BlockSize) { FIXED_BLOCK_DIM( KeMatrixTopPBeamTopK<<>>( reinterpret_cast(const_cast(x.data())), reinterpret_cast(const_cast(top_ps.data())), topp_ids.data(), reinterpret_cast(topp_probs.data()), vocab_size, dev_curand_states, count_iter.data(), count_iter_begin.data())); default: PD_THROW("the input data shape has error in the topp_beam_topk kernel."); } // if (count % random_seed == random_seed - 1) { // #if CUDA_VERSION >= 11020 // cudaFreeAsync(dev_curand_states, cu_stream); // #else // cudaFree(dev_curand_states); // #endif // } count++; size_t temp_storage_bytes = 0; cub::TransformInputIterator segment_offsets_t_begin(count_iter_begin.data(), SegmentOffsetIter(vocab_size)); cub::TransformInputIterator segment_offsets_t_end(count_iter.data(), SegmentOffsetIter(vocab_size)); 
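// Descriptive note: the calls below follow CUB's usual two-pass pattern. The
// first SortPairsDescending call passes a null workspace pointer purely to
// query temp_storage_bytes; that size is rounded up to a multiple of 256, a
// UINT8 buffer of that size is allocated, and the second call performs the
// real per-batch (segmented) descending sort of probabilities and their ids.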
DataType_ *x_ptr = reinterpret_cast(const_cast(x.data())); DataType_ *sorted_out_ptr = reinterpret_cast(const_cast(sorted_out.data())); int64_t *in_id_ptr = inds_input.data(); int64_t *out_id_ptr = sorted_id.data(); cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, x_ptr, sorted_out_ptr, in_id_ptr, out_id_ptr, vocab_size * bs, bs, segment_offsets_t_begin, segment_offsets_t_end + 1, 0, sizeof(data_t) * 8, cu_stream); temp_storage_bytes = div_up(temp_storage_bytes, 256) * 256; int64_t temp_size = temp_storage_bytes; auto temp_storage = paddle::empty({temp_size}, paddle::DataType::UINT8, x.place()); cub::DeviceSegmentedRadixSort::SortPairsDescending( temp_storage.data(), temp_storage_bytes, x_ptr, sorted_out_ptr, in_id_ptr, out_id_ptr, vocab_size * bs, bs, segment_offsets_t_begin, segment_offsets_t_end + 1, 0, sizeof(data_t) * 8, cu_stream); switch (BlockSize) { FIXED_BLOCK_DIM( topp_sampling<<>>( sorted_out_ptr, out_id_ptr, reinterpret_cast(topp_probs.data()), topp_ids.data(), reinterpret_cast(const_cast(top_ps.data())), p_num, vocab_size, count_iter.data(), count_iter_begin.data())); default: PD_THROW("the input data shape has error in the topp_sampling kernel."); } return {topp_probs, topp_ids}; } std::vector TopPSampling(const paddle::Tensor& x, const paddle::Tensor& top_ps, int random_seed) { switch (x.type()) { case paddle::DataType::FLOAT16: { return top_p_sampling_kernel( x, top_ps, random_seed ); } case paddle::DataType::FLOAT32: { return top_p_sampling_kernel( x, top_ps, random_seed ); } default: { PD_THROW( "NOT supported data type. " "Only float16 and float32 are supported. "); break; } } } std::vector> TopPSamplingInferShape(const std::vector& x_shape, const std::vector& top_ps_shape) { std::vector out_probs_shape = {x_shape[0], 1}; std::vector out_ids_shape = {x_shape[0], 1}; return {out_probs_shape, out_ids_shape}; } std::vector TopPSamplingInferDtype(const paddle::DataType& x_dtype, const paddle::DataType& top_ps_dtype) { return {x_dtype, paddle::DataType::INT64}; } PD_BUILD_OP(topp_sampling) .Inputs({"x", "top_ps"}) .Outputs({"topp_probs", "topp_ids"}) .Attrs({"random_seed: int"}) .SetKernelFn(PD_KERNEL(TopPSampling)) .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingInferDtype)); ================================================ FILE: ppfleetx/optims/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
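# Illustrative single-row NumPy sketch (for readability only, not part of any
# ppfleetx module) of the sampling rule the topp_sampling kernels above appear
# to implement: probabilities are sorted in descending order, a threshold
# r ~ U(0, top_p) is drawn, and the first token whose cumulative probability
# reaches r is returned together with its probability.
import numpy as np


def topp_sampling_reference(probs, top_p, rng=None):
    rng = rng or np.random.default_rng()
    order = np.argsort(-probs)                    # descending sort of one row
    sorted_probs = probs[order]
    r = rng.uniform(0.0, float(top_p))            # threshold inside the nucleus
    cum = np.cumsum(sorted_probs)
    idx = min(int(np.searchsorted(cum, r)), len(cum) - 1)
    return float(sorted_probs[idx]), int(order[idx])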
from collections import defaultdict import sys import copy import paddle from paddle.optimizer.lr import LRScheduler from .lr_scheduler import * from .optimizer import * from .grad_clip import * from ppfleetx.utils.log import logger def build_lr_scheduler(lr_config): if 'name' in lr_config: lr_name = lr_config.pop('name') lr = eval(lr_name)(**lr_config) if isinstance(lr, LRScheduler): return lr else: return lr() else: lr = lr_config.learning_rate logger.debug("build lr ({}) success..".format(lr)) return lr def build_grad_clip(grad_clip_config): if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') clip_norm = grad_clip_config.get('clip_norm', 1.0) grad_clip = eval(grad_clip_name)( **grad_clip_config) if clip_norm != 0. else None return grad_clip else: return None def build_optimizer(config, model, lr_scheduler=None): config = copy.deepcopy(config) if lr_scheduler is not None: config.pop('lr') multi_precision = config.get('multi_precision', False) if multi_precision: paddle.nn.clip._clip_by_global_norm_using_mp_type(True) grad_clip_config = config.pop('grad_clip', None) grad_clip = build_grad_clip(grad_clip_config) optim_name = config.pop('name') optim = eval(optim_name)(learning_rate=lr_scheduler, parameters=model.parameters(), grad_clip=grad_clip, **config) logger.debug("build optimizer ({}) success..".format(optim)) return optim ================================================ FILE: ppfleetx/optims/grad_clip.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
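# Illustrative sketch of how the builders above fit together (the config
# values here are hypothetical, not taken from any shipped YAML); it is a
# reading aid and is not invoked anywhere in this package.
def _example_build_optimizer():
    import paddle
    from ppfleetx.optims import build_lr_scheduler, build_optimizer

    opt_cfg = {
        'name': 'AdamW',
        'weight_decay': 0.01,
        'lr': {'name': 'CosineAnnealingWithWarmupDecay', 'max_lr': 1e-4,
               'min_lr': 1e-5, 'warmup_rate': 0.01, 'decay_steps': 10000},
        'grad_clip': {'name': 'ClipGradByGlobalNorm', 'clip_norm': 1.0},
    }
    model = paddle.nn.Linear(8, 8)                 # stand-in model
    scheduler = build_lr_scheduler(opt_cfg['lr'])  # consumes the 'lr' block
    return build_optimizer(opt_cfg, model, scheduler)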
import paddle from paddle.nn.clip import ClipGradByGlobalNorm from paddle.nn.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base from paddle.fluid import core, layers from paddle.distributed import collective import paddle.distributed.fleet as fleet from ppfleetx.distributed.apis import env class ClipGradForMOEByGlobalNorm(ClipGradBase): def __init__(self, clip_norm): super(ClipGradForMOEByGlobalNorm, self).__init__() self.clip_norm = float(clip_norm) self.moe_group = None self.world_size = paddle.distributed.get_world_size() if self.world_size > 1: hcg = env.get_hcg() self.moe_group = hcg.get_expert_parallel_group() def __str__(self): return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) @staticmethod def get_l2_norm_pow(params_grads, sum_dtype=None): sum_square_list = [] sum_square_list_fp16 = [] sum_square_list_fp32 = [] for p, g in params_grads: if g is None: continue if getattr(p, 'need_clip', True) is False: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) if sum_square.dtype == core.VarDesc.VarType.FP16: sum_square_list_fp16.append(sum_square) elif sum_square.dtype == core.VarDesc.VarType.FP32: sum_square_list_fp32.append(sum_square) else: sum_square_list.append(sum_square) # all parameters have been filterd out if len(sum_square_list) + len(sum_square_list_fp16) + len( sum_square_list_fp32) == 0: return None, None assert sum_dtype in ["float64", "float32", None], \ "sum's type must be float64/ float32 / None" if sum_dtype != "float64": sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" global_norm_var = [] if len(sum_square_list_fp16) > 0: global_norm_var_fp16 = layers.concat(sum_square_list_fp16) global_norm_var_fp16 = layers.reduce_sum(global_norm_var_fp16) global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) if len(sum_square_list_fp32) > 0: global_norm_var_fp32 = layers.concat(sum_square_list_fp32) global_norm_var_fp32 = layers.reduce_sum(global_norm_var_fp32) if sum_dtype == 'float32': global_norm_var.append(global_norm_var_fp32) else: global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) if len(sum_square_list) > 0: global_norm_var_fp64 = layers.concat(sum_square_list) global_norm_var_fp64 = layers.reduce_sum(global_norm_var_fp64) global_norm_var.append(global_norm_var_fp64) global_norm_var = layers.concat(global_norm_var) global_norm_var = layers.reduce_sum(global_norm_var) return global_norm_var, sum_dtype @imperative_base.no_grad def _dygraph_clip(self, params_grads): normal_params_grads = [] moe_params_grads = [] # separate moe params from normal params if self.moe_group is not None and self.moe_group.nranks > 1: for p, g in params_grads: if "expert" in p.name or "gate" in p.name: moe_params_grads.append((p, g)) else: normal_params_grads.append((p, g)) else: normal_params_grads = params_grads # why to return sum_dtype? # we will call `get_l2_norm_pow` twice and the precisions may be different. 
# For convenience and simplification, we use sum_dtype directly instead of global_norm_var_normal.dtype global_norm_var_normal, sum_dtype \ = self.get_l2_norm_pow(normal_params_grads) global_norm_var_moe = None if len(moe_params_grads) > 0: global_norm_var_moe, _ \ = self.get_l2_norm_pow(moe_params_grads, sum_dtype) if global_norm_var_moe is not None: collective.all_reduce( global_norm_var_moe, op=collective.ReduceOp.SUM, group=self.moe_group) if global_norm_var_normal is None and global_norm_var_moe is None: return params_grads elif global_norm_var_normal is None: global_norm_var = global_norm_var_moe elif global_norm_var_moe is None: global_norm_var = global_norm_var_normal else: if global_norm_var_normal.dtype != global_norm_var_moe.dtype: # compared with normal norm, moe norm is the later one, # so its precision is no lower than normal norm global_norm_var_normal = \ global_norm_var_normal.astype(global_norm_var_moe.dtype) global_norm_var = global_norm_var_normal + global_norm_var_moe global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, y=layers.elementwise_max( x=global_norm_var, y=max_global_norm)) clip_var_fp16 = paddle.cast(clip_var, paddle.float16) for p, g in params_grads: if g is None or getattr(p, 'need_clip', True) is False: continue if p.dtype == paddle.float16: g.scale_(clip_var_fp16) else: g.scale_(clip_var) p._reset_grad_inplace_version(True) return params_grads ================================================ FILE: ppfleetx/optims/lr_scheduler.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
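# Illustrative single-process NumPy sketch (not part of this module) of the
# clipping rule ClipGradForMOEByGlobalNorm applies above: squared L2 norms of
# expert ("moe") gradients are summed and all-reduced inside the expert group,
# added to the squared norms of the remaining gradients, and every gradient is
# then scaled by clip_norm / max(global_norm, clip_norm).
import numpy as np


def moe_global_norm_clip_reference(normal_grads, moe_grads_per_rank, clip_norm=1.0):
    sq_normal = sum(float(np.sum(g * g)) for g in normal_grads)
    # stand-in for collective.all_reduce over the expert-parallel group
    sq_moe = sum(float(np.sum(g * g)) for rank in moe_grads_per_rank for g in rank)
    global_norm = float(np.sqrt(sq_normal + sq_moe))
    scale = clip_norm / max(global_norm, clip_norm)
    # the same scale is applied to the moe gradients on their owning ranks
    return scale, [g * scale for g in normal_grads]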
import math import numpy import warnings from paddle import Tensor from paddle.optimizer import lr from paddle.optimizer.lr import LRScheduler __all__ = [ 'CosineAnnealingWithWarmupDecay', 'LinearDecayWithWarmup', 'ViTLRScheduler', 'MultiStepDecay', 'CosineDecay', ] class CosineAnnealingWithWarmupDecay(LRScheduler): def __init__(self, max_lr, min_lr, warmup_rate, decay_steps, last_epoch=0, verbose=False, **kwargs): self.decay_steps = decay_steps self.warmup_step = warmup_rate * decay_steps self.max_lr = max_lr self.min_lr = min_lr super(CosineAnnealingWithWarmupDecay, self).__init__( max_lr, last_epoch, verbose) def get_lr(self): if self.warmup_step > 0 and self.last_epoch <= self.warmup_step: return float(self.max_lr) * (self.last_epoch) / self.warmup_step if self.last_epoch > self.decay_steps: return self.min_lr num_step_ = self.last_epoch - self.warmup_step decay_steps_ = self.decay_steps - self.warmup_step decay_ratio = float(num_step_) / float(decay_steps_) coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) return self.min_lr + coeff * (self.max_lr - self.min_lr) def step(self, epoch=None): if epoch is None: self.last_epoch += 0 self.last_lr = self.get_lr() else: self.last_epoch += epoch if hasattr(self, "_get_closed_form_lr"): self.last_lr = self._get_closed_form_lr() else: self.last_lr = self.get_lr() if self.verbose: print('Epoch {}: {} set learning rate to {}.'.format( self.last_epoch, self.__class__.__name__, self.last_lr)) class LinearDecayWithWarmup(LRScheduler): def __init__(self, learning_rate, step_each_epoch, epochs, warmup=0, verbose=False, last_epoch=-1, **kwargs): if kwargs.get('total_steps', -1) > 0: self.T_max = total_steps else: self.T_max = epochs * step_each_epoch self.warmup_steps = warmup if isinstance( warmup, int) else int(math.floor(warmup * self.T_max)) super(LinearDecayWithWarmup, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): if self.last_epoch < self.warmup_steps: return self.base_lr * (float(self.last_epoch) / float(max(1, self.warmup_steps))) return self.base_lr * max(0.0, 1.0 - self.last_epoch / self.T_max) class ViTLRScheduler(LRScheduler): def __init__(self, learning_rate, step_each_epoch, epochs, decay_type='cosine', linear_end=1e-5, warmup_steps=0, verbose=False, last_epoch=-1, **kwargs): self.linear_end = linear_end self.T_max = epochs * step_each_epoch self.warmup_steps = warmup_steps if self.warmup_steps >= self.T_max: self.warmup_steps = self.T_max - 1 self.decay_type = decay_type self.last_epoch = last_epoch super(ViTLRScheduler, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): progress = (self.last_epoch - self.warmup_steps ) / float(self.T_max - self.warmup_steps) progress = min(1.0, max(0.0, progress)) if self.decay_type == 'linear': lr = self.linear_end + (self.base_lr - self.linear_end) * ( 1.0 - progress) elif self.decay_type == 'cosine': lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress)) if self.warmup_steps: lr = lr * min(1.0, self.last_epoch / self.warmup_steps) return lr class MultiStepDecay(lr.MultiStepDecay): def __init__(self, learning_rate, step_each_epoch, epochs, milestones, gamma=0.1, last_epoch=-1, verbose=False, **kwargs): super(MultiStepDecay, self).__init__( learning_rate=learning_rate, milestones=milestones, gamma=gamma, last_epoch=last_epoch, verbose=verbose) class CosineDecay(lr.LRScheduler): def __init__(self, learning_rate, step_each_epoch, epochs, update_unit='epoch', warmups=0, verbose=False, last_epoch=-1, **kwargs): self.T_max = epochs if update_unit == 
'epoch' else step_each_epoch * epochs self.warmups = warmups if update_unit == 'epoch' else step_each_epoch * warmups assert self.warmups < self.T_max self.last_epoch = last_epoch super(CosineDecay, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): progress = ( self.last_epoch - self.warmups) / float(self.T_max - self.warmups) progress = min(1.0, max(0.0, progress)) if self.warmups: lr = lr * min(1.0, self.last_epoch / self.warmups) else: lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress)) return lr ================================================ FILE: ppfleetx/optims/optimizer.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import paddle import paddle.distributed.fleet as fleet from ppfleetx.utils.tensor_fusion_helper import fused_parameters from paddle.optimizer import Adam, AdamW, Momentum from ppfleetx.distributed.apis import env __all__ = [ 'Adam', 'AdamW', 'Momentum', 'FusedAdamW', ] class FusedAdamW(paddle.optimizer.AdamW): def __init__(self, learning_rate, parameters, grad_clip, **config): tensor_fusion = config.pop("tensor_fusion", False) if paddle.distributed.get_world_size() > 1: hcg = env.get_hcg() sharding_size = hcg.get_sharding_parallel_world_size() if tensor_fusion: self.decay_fused_tensors, self.all_fused_tensors = fused_parameters( parameters, sharding_size > 1) decay_params = [p.name for p in self.decay_fused_tensors] else: decay_params = [ p.name for p in parameters if not any(nd in p.name for nd in ["bias", "norm", "b_0"]) ] apply_decay_param_fun = lambda x: x in decay_params super().__init__( learning_rate=learning_rate, parameters=self.all_fused_tensors if tensor_fusion else parameters, grad_clip=grad_clip, apply_decay_param_fun=apply_decay_param_fun, **config) ================================================ FILE: ppfleetx/tools/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ================================================ FILE: ppfleetx/tools/multiprocess_tool.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import multiprocessing from multiprocessing import Process import math import time import os import argparse import warnings """ Multi-process batch processing tool This tool provides a multi-process batch processing method. For example, multi-process batch download data, multi-process preprocessing data, etc. The tool relies on executable shell commands or scripts. Its essence is to use Python's multi-process library to create multiple processes, and call executable commands or scripts through the os.system API. Executable commands or scripts are passed in via a txt text file, organized by line. For example, the following example is download, unzip and delete example. batch_cmd.txt wget http://xxxx.com/0.tar && tar -xf 0.tar && rm 0.tar wget http://xxxx.com/1.tar && tar -xf 1.tar && rm 1.tar ... wget http://xxxx.com/99.tar && tar -xf 99.tar && rm 99.tar How to run: python multiprocess_tool.py --num_proc 10 --shell_cmd_list_filename batch_cmd.txt """ def process_fn(cmd_list): for cmd in cmd_list: try: ret = os.system(cmd) if ret != 0: raise Exception(f'execute command: {cmd} failed.') except Exception as e: print(e) def read_command(shell_cmd_list_filename): shell_cmd_list = [] with open(shell_cmd_list_filename, 'r') as f: for cmd in f: cmd = cmd.strip() shell_cmd_list.append(cmd) return shell_cmd_list def parallel_process(cmd_list, nproc=20): if nproc > multiprocessing.cpu_count(): warnings.warn( 'The set number of processes exceeds the number of cpu cores, please confirm whether it is reasonable.' ) num_cmd = len(cmd_list) num_cmd_part = (num_cmd + nproc - 1) // nproc workers = [] for i in range(min(nproc, num_cmd)): start = i * num_cmd_part end = min(start + num_cmd_part, num_cmd) p = Process(target=process_fn, args=(cmd_list[start:end], )) workers.append(p) p.start() for p in workers: p.join() def main(args): start = time.time() shell_cmd_list = read_command(args.shell_cmd_list_filename) parallel_process(shell_cmd_list, args.num_proc) end = time.time() print("Cost time: {:.2f}".format(end - start)) if __name__ == "__main__": parse = argparse.ArgumentParser( description='multi-process batch processing tool') parse.add_argument('--num_proc', type=int, default=20) parse.add_argument( '--shell_cmd_list_filename', type=str, help='a txt file contains shell command list to be execute.') args = parse.parse_args() main(args) ================================================ FILE: ppfleetx/utils/__init__.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
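# Illustrative sketch (hypothetical URLs, mirroring the batch_cmd.txt example
# in multiprocess_tool.py above) of driving the tool programmatically instead
# of through its CLI; it is not invoked anywhere in this repository.
def _example_parallel_download():
    from ppfleetx.tools.multiprocess_tool import parallel_process

    cmds = [
        "wget http://xxxx.com/{i}.tar && tar -xf {i}.tar && rm {i}.tar".format(i=i)
        for i in range(4)
    ]
    parallel_process(cmds, nproc=2)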
================================================ FILE: ppfleetx/utils/check.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import paddle from paddle import is_compiled_with_cuda from .log import logger from .device import get_device_and_mapping def check_version(): """ Log error and exit when the installed version of paddlepaddle is not satisfied. """ err = "PaddlePaddle version 1.8.0 or higher is required, " \ "or a suitable develop version is satisfied as well. \n" \ "Please make sure the version is good with your code." try: pass # paddle.utils.require_version('0.0.0') except Exception: logger.error(err) sys.exit(1) def check_device(device): """ Log error and exit when using paddlepaddle cpu version. """ err = "You are using paddlepaddle %s version! Please try to \n" \ "1. install paddlepaddle-%s to run model on %s \nor 2. set the config option 'Global.device' to %s." d, supported_device_map = get_device_and_mapping() assert device in supported_device_map, \ f"the device({device}) to check is not supported by now.Now the paddle only supports: {supported_device_map.keys()}" err = err % (d, device, device, d) try: assert supported_device_map[device] except AssertionError: logger.error(err) sys.exit(1) ================================================ FILE: ppfleetx/utils/compression_helper.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle import paddleslim def get_pruned_params(model): params = [] for sublayer in model.sublayers(): for param in sublayer.parameters(include_sublayers=False): if isinstance(sublayer, paddle.nn.layer.common.Linear) or isinstance( sublayer, paddle.distributed.fleet.layers.mpu. mp_layers.ColumnParallelLinear) or isinstance( sublayer, paddle.distributed.fleet.layers. mpu.mp_layers.RowParallelLinear): if len(param.shape) != 2: continue # NOTE(minghaoBD): # 1. param.shape[1] == 3 * param.shape[0]: prune fused-qkv's weight and its next weight: out-linear's weight # 2. 
param.shape[1] == 4 * param.shape[0]: prune ffn1's weight and its next weight: ffn2's weight # If your model has a different architecture, like your qkv's weights are not fused or ffn1_weight.shape[1] != 4*ffn1_weight.shape[0], you may need to customize this function to suit your model. if param.shape[1] == 3 * param.shape[0] or param.shape[ 1] == 4 * param.shape[0]: params.append(param.name) return params def prune_model(model, configs, inputs_desc=[]): prune_criterion = configs.criterion ratio = configs.ratio shapes, dtypes = [], [] for input_desc in inputs_desc: dtypes.append(input_desc.dtype) new_shape = [10 if item == -1 else item for item in input_desc.shape] shapes.append(new_shape) #TODO(minghaoBD): support ViT and other model architectures in the future num_attention_heads = model.gpt.decoder.layers[0].self_attn.num_heads if prune_criterion == 'l1_norm': pruner = paddleslim.L1NormFilterPruner( model, shapes, skip_leaves=False, prune_type='fc', input_dtype=dtypes[0], num_head=num_attention_heads) elif prune_criterion == 'l2_norm': pruner = paddleslim.L2NormFilterPruner( model, shapes, skip_leaves=False, prune_type='fc', input_dtype=dtypes[0], num_head=num_attention_heads) params = get_pruned_params(model) ratios = {} for param in params: ratios[param] = ratio #NOTE(minghaoBD): hidden size in Layernorm must be 768/1024/2048/4096 for best inference performace, and when axis=0, the hidden size in layernorm will be changed accordingly. So axis=1 is required. plan = pruner.prune_vars(ratios, [1]) def quant_model(model, configs): quanter = paddleslim.dygraph.quant.QAT(configs) return quanter.quantize(model), quanter ================================================ FILE: ppfleetx/utils/config.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import copy import argparse import yaml import codecs import sys import logging from .log import logger, advertise from . 
import check import paddle import paddle.distributed as dist import paddle.distributed.auto_parallel as auto from paddle.fluid.reader import use_pinned_memory __all__ = ['get_config', 'print_config'] def process_dist_config(configs): """ process distributed strategy for hybrid parallel """ nranks = dist.get_world_size() config = configs['Distributed'] config.setdefault("hcg", "HybridCommunicateGroup") mp_degree = config.setdefault("mp_degree", 1) pp_degree = config.setdefault("pp_degree", 1) pp_recompute_interval = config.setdefault("pp_recompute_interval", 1) # sharding default sharding_config = config['sharding'] sharding_degree = sharding_config.setdefault("sharding_degree", 1) sharding_stage = sharding_config.setdefault('sharding_stage', 2) sharding_offload = sharding_config.setdefault('sharding_offload', False) reduce_overlap = sharding_config.setdefault('reduce_overlap', False) broadcast_overlap = sharding_config.setdefault('broadcast_overlap', False) other_degree = mp_degree * pp_degree * sharding_degree assert nranks % other_degree == 0, "unreasonable config of dist_strategy." dp_degree = config.setdefault("dp_degree", nranks // other_degree) assert nranks % dp_degree == 0, "unreasonable config of dist_strategy." assert nranks == dp_degree * other_degree, \ "Mismatched config using {} cards with dp_degree[{}]," \ "mp_degree[{}], pp_degree[{}] and sharding_degree[{}]".format(nranks, \ dp_degree, mp_degree, pp_degree, sharding_degree) if sharding_config['sharding_degree'] > 1 and reduce_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['reduce_overlap'] = False logger.warning( "reduce overlap only valid for sharding stage 2 without offload" ) if sharding_config['sharding_degree'] > 1 and broadcast_overlap: if sharding_config['sharding_stage'] == 3 or sharding_config[ 'sharding_offload']: sharding_config['broadcast_overlap'] = False logger.warning( "broadcast overlap only valid for sharding stage 2 without offload" ) if broadcast_overlap and configs['Engine']['logging_freq'] == 1: logger.warning( "Set logging_freq to 1 will disable broadcast_overlap. " "If you want to overlap the broadcast, please increase the logging_freq." ) sharding_config['broadcast_overlap'] = False if sharding_config['sharding_degree'] > 1: if getattr(sharding_config, 'broadcast_overlap', False): logger.warning( "Enable broadcast overlap for sharding will not use pin memory for dataloader" ) use_pinned_memory(False) if 'fuse_sequence_parallel_allreduce' not in config: config['fuse_sequence_parallel_allreduce'] = False if 'use_main_grad' in config and config['use_main_grad'] is True: logger.warning( "If use_main_grad is True, fuse_sequence_parallel_allreduce will be forced to False" ) config['fuse_sequence_parallel_allreduce'] = False def process_global_configs(config): """ process global configs for hybrid parallel """ dp_degree = config['Distributed']['dp_degree'] pp_degree = config['Distributed']['pp_degree'] sharding_degree = config['Distributed']['sharding']['sharding_degree'] config['Global']['enable_partial_send_recv'] = True if 'sequence_parallel' in config['Model'] and pp_degree > 1: if config['Model']['sequence_parallel']: config['Global']['enable_partial_send_recv'] = False logger.warning( "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv will be set False." 
) global_cfg = config['Global'] # Set environment variable flags = global_cfg.get("flags", {}) paddle.set_flags(flags) for k, v in flags.items(): logger.info("Environment variable {} is set {}.".format(k, v)) if global_cfg['global_batch_size'] is None and global_cfg[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is not None: assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == (dp_degree * sharding_degree), "global_batch_size[{}] should be divided by local_batch_size[{}] "\ "when dp_degree is [{}] and sharding_degree is [{}]".format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree, sharding_degree) elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is None: assert global_cfg['global_batch_size'] % (dp_degree * sharding_degree) == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}] times sharding_degree[{}]"\ .format(global_cfg['global_batch_size'], dp_degree, sharding_degree) global_cfg['local_batch_size'] = global_cfg['global_batch_size'] // ( dp_degree * sharding_degree) else: global_cfg['global_batch_size'] = global_cfg[ 'local_batch_size'] * dp_degree * sharding_degree assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0 def process_engine_config(config): """ process engine """ # save_load config.Engine['save_load'] = config.Engine.get('save_load', {}) save_load_cfg = config.Engine.save_load save_steps = save_load_cfg.get('save_steps', None) save_epoch = save_load_cfg.get('save_epoch', None) if save_steps is None or save_steps == -1: save_load_cfg[ 'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint if save_epoch is None or save_epoch == -1: save_load_cfg['save_epoch'] = 1 save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output') save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None) # mix_precision config.Engine['mix_precision'] = config.Engine.get('mix_precision', {}) amp_cfg = config.Engine.mix_precision amp_cfg['enable'] = amp_cfg.get('enable', False) amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768) amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None) amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None) # engine config.Engine['max_steps'] = config.Engine.get('max_steps', 500000) config.Engine['eval_freq'] = config.Engine.get('eval_freq', -1) config.Engine['eval_iters'] = config.Engine.get('eval_iters', 0) config.Engine['logging_freq'] = config.Engine.get('logging_freq', 1) config.Engine['num_train_epochs'] = config.Engine.get('num_train_epochs', 1) config.Engine['test_iters'] = config.Engine['eval_iters'] * 10 \ if config.Engine.get('test_iters', None) is None else config.Engine['test_iters'] config.Engine[ 'accumulate_steps'] = config.Global.local_batch_size // config.Global.micro_batch_size class AttrDict(dict): def __getattr__(self, key): return self[key] def __setattr__(self, key, value): if key in self.__dict__: self.__dict__[key] = value else: self[key] = value def __copy__(self): cls = self.__class__ result = cls.__new__(cls) result.__dict__.update(self.__dict__) return result def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): setattr(result, k, copy.deepcopy(v, memo)) for k, v in self.items(): setattr(result, k, copy.deepcopy(v, memo)) return result def 
setdefault(self, k, default=None): if k not in self or self[k] is None: self[k] = default return default else: return self[k] def create_attr_dict(yaml_config): from ast import literal_eval for key, value in yaml_config.items(): if type(value) is dict: yaml_config[key] = value = AttrDict(value) if isinstance(value, str): try: value = literal_eval(value) except BaseException: pass if isinstance(value, AttrDict): create_attr_dict(yaml_config[key]) else: yaml_config[key] = value def parse_config(cfg_file): """Load a config file into AttrDict""" def _update_dic(dic, base_dic): '''Update config from dic based base_dic ''' base_dic = base_dic.copy() dic = dic.copy() if dic.get('_inherited_', True) == False: dic.pop('_inherited_') return dic for key, val in dic.items(): if isinstance(val, dict) and key in base_dic: base_dic[key] = _update_dic(val, base_dic[key]) else: base_dic[key] = val dic = base_dic return dic def _parse_from_yaml(path): '''Parse a yaml file and build config''' with codecs.open(path, 'r', 'utf-8') as file: dic = yaml.load(file, Loader=yaml.FullLoader) if '_base_' in dic: cfg_dir = os.path.dirname(path) base_path = dic.pop('_base_') base_path = os.path.join(cfg_dir, base_path) base_dic = _parse_from_yaml(base_path) dic = _update_dic(dic, base_dic) return dic yaml_dict = _parse_from_yaml(cfg_file) yaml_config = AttrDict(yaml_dict) create_attr_dict(yaml_config) return yaml_config def print_dict(d, delimiter=0): """ Recursively visualize a dict and indenting acrrording by the relationship of keys. """ placeholder = "-" * 60 for k, v in sorted(d.items()): if isinstance(v, dict): logger.info("{}{} : ".format(delimiter * " ", k)) print_dict(v, delimiter + 4) elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): logger.info("{}{} : ".format(delimiter * " ", k)) for value in v: print_dict(value, delimiter + 4) else: logger.info("{}{} : {}".format(delimiter * " ", k, v)) if k.isupper(): logger.info(placeholder) def print_config(config): """ visualize configs Arguments: config: configs """ advertise() print_dict(config) def check_config(config): """ Check config """ # global_batch_size = config.get("") global_config = config.get('Global') check.check_version() device = global_config.get('device', 'gpu') device = device.lower() if device in ['gpu', 'xpu', 'rocm', 'npu', "cpu", 'mlu']: check.check_device(device) else: raise ValueError( f"device({device}) is not in ['gpu', 'xpu', 'rocm', 'npu', 'cpu', 'mlu'],\n" "Please ensure the config option Global.device is one of these devices" ) def override(dl, ks, v): """ Recursively replace dict of list Args: dl(dict or list): dict or list to be replaced ks(list): list of keys v(str): value to be replaced """ def str2num(v): try: return eval(v) except Exception: return v assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") assert len(ks) > 0, ('lenght of keys should larger than 0') if isinstance(dl, list): k = str2num(ks[0]) if len(ks) == 1: assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) dl[k] = str2num(v) else: override(dl[k], ks[1:], v) else: if len(ks) == 1: # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) if not ks[0] in dl: print('A new field ({}) detected!'.format(ks[0], dl)) dl[ks[0]] = str2num(v) else: if ks[0] not in dl.keys(): dl[ks[0]] = {} print("A new Series field ({}) detected!".format(ks[0], dl)) override(dl[ks[0]], ks[1:], v) def override_config(config, options=None): """ Recursively override the config Args: config(dict): dict to be replaced options(list): list of 
pairs(key0.key1.idx.key2=value) such as: [ 'topk=2', 'VALID.transforms.1.ResizeImage.resize_short=300' ] Returns: config(dict): replaced config """ if options is not None: for opt in options: assert isinstance(opt, str), ( "option({}) should be a str".format(opt)) assert "=" in opt, ( "option({}) should contain a =" "to distinguish between key and value".format(opt)) pair = opt.split('=') assert len(pair) == 2, ("there can be only a = in the option") key, value = pair keys = key.split('.') override(config, keys, value) return config def get_config(fname, overrides=None, show=False): """ Read config from file """ assert os.path.exists(fname), ( 'config file({}) is not exist'.format(fname)) config = parse_config(fname) override_config(config, overrides) process_dist_config(config) process_global_configs(config) process_engine_config(config) create_attr_dict(AttrDict(config)) if show: print_config(config) check_config(config) return config def process_auto_dist_configs(config): """ process distributed strategy for auto parallel """ configs = config['Distributed'] nranks = dist.get_world_size() mp_degree = configs.setdefault("mp_degree", 1) pp_degree = configs.setdefault("pp_degree", 1) sharding_config = configs['sharding'] sharding_degree = sharding_config.setdefault("sharding_degree", 1) other_degree = mp_degree * pp_degree assert nranks % other_degree == 0, "Requires nranks should be divided by mp_degree*pp_degree." dp_degree = configs.setdefault("dp_degree", nranks // other_degree) assert nranks % dp_degree == 0, "unreasonable config of dist_strategy." assert nranks == dp_degree * other_degree, \ "Mismatched config using {} cards with dp_degree[{}]," \ "mp_degree[{}], pp_degree[{}] and sharding_degree[{}]".format(nranks, \ dp_degree, mp_degree, pp_degree, sharding_degree) def process_auto_global_configs(config): """ process global configs for auto parallel """ dp_degree = config['Distributed']['dp_degree'] pp_degree = config['Distributed']['pp_degree'] # sharding_degree = config['Distributed']['sharding_degree'] config['Global']['enable_partial_send_recv'] = True if config.get('Model', None) is not None and 'sequence_parallel' in config[ 'Model'] and pp_degree > 1: if config['Model']['sequence_parallel']: config['Global']['enable_partial_send_recv'] = False logger.warning( "if config.Distributed.pp_degree > 1 and config.Model.sequence_parallel is True, " \ "config.Global.enable_partial_send_recv will be set False." 
) global_cfg = config['Global'] if global_cfg['global_batch_size'] is None and global_cfg[ 'local_batch_size'] is None: raise ValueError( "global_batch_size or local_batch_size should be set.") elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is not None: assert global_cfg['global_batch_size'] // global_cfg['local_batch_size'] == dp_degree, \ "global_batch_size[{}] should be divided by local_batch_size[{}] when dp_degree is [{}]"\ .format(global_cfg['global_batch_size'], global_cfg['local_batch_size'], dp_degree) elif global_cfg['global_batch_size'] is not None and global_cfg[ 'local_batch_size'] is None: assert global_cfg['global_batch_size'] % dp_degree == 0, \ "global_batch_size[{}] should be divided by dp_degree[{}]".format(global_cfg['global_batch_size'], dp_degree) global_cfg['local_batch_size'] = global_cfg[ 'global_batch_size'] // dp_degree else: global_cfg['global_batch_size'] = global_cfg[ 'local_batch_size'] * dp_degree assert global_cfg['local_batch_size'] % global_cfg['micro_batch_size'] == 0 def process_auto_engine_configs(config): """ process engine configs for auto parallel """ if config.Engine.get("verbose", None) is None: config.Engine["verbose"] = 2 if config.Engine.get("logging_freq", None) is None: config.Engine["logging_freq"] = 10 config.Engine['save_load'] = config.Engine.get('save_load', {}) save_load_cfg = config.Engine.save_load save_steps = save_load_cfg.get('save_steps', None) save_epoch = save_load_cfg.get('save_epoch', None) if save_steps is None or save_steps == -1: save_load_cfg[ 'save_steps'] = sys.maxsize if sys.version > '3' else sys.maxint if save_epoch is None or save_epoch == -1: save_load_cfg['save_epoch'] = 1 save_load_cfg['output_dir'] = save_load_cfg.get('output_dir', './output') save_load_cfg['ckpt_dir'] = save_load_cfg.get('ckpt_dir', None) config.Engine['max_steps'] = config.Engine.get('max_steps', 500000) config.Engine['eval_freq'] = config.Engine.get('eval_freq', -1) config.Engine['eval_iters'] = config.Engine.get('eval_iters', 0) config.Engine['logging_freq'] = config.Engine.get('logging_freq', 1) config.Engine['num_train_epochs'] = config.Engine.get('num_train_epochs', 1) config.Engine['test_iters'] = config.Engine['eval_iters'] * 10 \ if config.Engine.get('test_iters', None) is None else config.Engine['test_iters'] config.Engine[ 'accumulate_steps'] = config.Global.local_batch_size // config.Global.micro_batch_size def process_auto_strategy(config): """ process auto strategy for auto parallel """ strategy = auto.Strategy() strategy.auto_mode = "semi" strategy.seed = config['Global']['seed'] # amp config amp_cfg = config.Engine.get('mix_precision', {}) amp = strategy.amp amp.enable = amp_cfg.get('enable', False) amp.dtype = amp_cfg.get('dtype', "float16") amp.level = amp_cfg.get('level', "o2") amp.init_loss_scaling = amp_cfg.get('scale_loss', 32768) amp.custom_black_list = amp_cfg.get('custom_black_list', []) amp.custom_white_list = amp_cfg.get('custom_white_list', []) amp.use_fp16_guard = amp_cfg.get('use_fp16_guard', False) amp.use_bf16_guard = amp_cfg.get('use_bf16_guard', False) # recompute config if config.get('Model', None) is not None: if not config.Model.get('no_recompute_layers', None): config.Model['no_recompute_layers'] = [] else: assert isinstance(config.Model['no_recompute_layers'], list), "no_recompute_layers should be a list" for i in config.Model['no_recompute_layers']: assert isinstance( i, int ), "all values in no_recompute_layers should be an integer" assert 
min(config.Model['no_recompute_layers']) >= 0, \ "the min value in no_recompute_layers should >= 0" assert max(config.Model['no_recompute_layers']) < config.Model['num_layers'], \ "the max value in no_recompute_layers should < num_layers" config.Model['no_recompute_layers'] = sorted( list(set(config.Model['no_recompute_layers']))) recompute = strategy.recompute recompute.enable = config.Model.get('use_recompute', False) recompute.no_recompute_segments = config.Model.pop( 'no_recompute_layers', []) recompute.enable_tuning = config.get( 'Tuning', False) and config.Tuning.get('tuning_recompute', False) # sharding config sharding_cfg = config.Distributed.get('sharding', {}) sharding = strategy.sharding sharding.enable = sharding_cfg.get('sharding_degree', 1) > 1 sharding.degree = sharding_cfg.get('sharding_degree', 1) sharding.stage = sharding_cfg.get('sharding_stage', 1) # gradient merge config gradient_merge = strategy.gradient_merge gradient_merge.enable = config.Engine.get('accumulate_steps') > 1 gradient_merge.k_steps = config.Engine.get('accumulate_steps', 1) # quantization config qat_cfg = config.get('Quantization', {}) qat = strategy.qat qat.enable = qat_cfg.get('enable', False) qat.channel_wise_abs_max = qat_cfg.get('channel_wise_abs_max', True) qat.weight_bits = qat_cfg.get('weight_bits', 8) qat.activation_bits = qat_cfg.get('activation_bits', 8) qat.onnx_format = qat_cfg.get('onnx_format', True) # tuning config tuning_cfg = config.get('Tuning', {}) tuning = strategy.tuning tuning.enable = tuning_cfg.get('enable', False) tuning.profile_start_step = tuning_cfg.get('profile_start_step', 1) tuning.profile_end_step = tuning_cfg.get('profile_end_step', 1) tuning.run_after_tuning = tuning_cfg.get('run_after_tuning', True) tuning.debug = tuning_cfg.get('debug', True) engine_cfg = config['Engine'] engine_cfg['strategy'] = strategy def process_auto_ckpt_dir(config): configs = config["Engine"]["save_load"] ckpt_dir = configs.get("ckpt_dir", None) if ckpt_dir is None: return assert os.path.isdir(ckpt_dir) == False, "Wrong setting of ckpt_dir!ckpt_dir can't be a folder,"\ "but {} is a folder. Your `ckpt_dir` should be `dirname/prefix` like `output/auto`"\ " if your model path is `output/auto_dist0.pdparams`".format(ckpt_dir) assert os.path.exists(ckpt_dir) == False, "Wrong setting of ckpt_dir,"\ "if you want to load weight,you should set ckpt_dir like this!"\ "for example:\ngpt_auto_model_save\n\t--auto_dist0.pdparams\n\t--auto_dist0.pdparams\n"\ "\t--auto_dist0.pdattr\nyou should set ckpt_dir=\"gpt_auto_model_save/auto\"" parent_path = os.path.split(ckpt_dir)[0] if os.path.exists(parent_path) == False: logging.warning("{} path is not existed!we will set ckpt_dir None.". 
format(parent_path)) configs["ckpt_dir"] = None def get_auto_config(fname, overrides=None, show=False): """ Read config from file for auto parallel """ assert os.path.exists(fname), ( 'config file({}) does not exist'.format(fname)) config = parse_config(fname) override_config(config, overrides) process_auto_dist_configs(config) process_auto_global_configs(config) process_auto_engine_configs(config) process_auto_strategy(config) process_auto_ckpt_dir(config) if show: print_config(config) check_config(config) return config def parse_args(): parser = argparse.ArgumentParser("train script") parser.add_argument( '-c', '--config', type=str, default='configs/config.yaml', help='config file path') parser.add_argument( '-o', '--override', action='append', default=[], help='config options to be overridden') args = parser.parse_args() return args ================================================ FILE: ppfleetx/utils/device.py ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from .log import logger def get_device_and_mapping(): """ Return device type and a name-bool mapping indicating which type is supported. """ supported_device_map = { "gpu": paddle.is_compiled_with_cuda(), "xpu": paddle.is_compiled_with_xpu(), "rocm": paddle.is_compiled_with_rocm(), "npu": paddle.is_compiled_with_custom_device("npu"), "mlu": 'mlu' in paddle.device.get_all_custom_device_type(), "cpu": True } for d, v in supported_device_map.items(): if v: return d, supported_device_map def get_device(): """ Return the device with which Paddle is compiled, including 'gpu'(for rocm and gpu), 'npu', 'xpu', 'cpu'. """ d, _ = get_device_and_mapping() return d def synchronize(): """ Synchronize device, return True if succeeded, otherwise return False """ device = paddle.get_device().split(":")[0] if device in ["gpu", "rocm"]: paddle.device.cuda.synchronize() return True elif device == "xpu": paddle.device.xpu.synchronize() return True elif device in paddle.device.get_all_custom_device_type(): paddle.device.synchronize() return True else: logger.warning( "The synchronization is only supported on cuda and xpu now.") return False ================================================ FILE: ppfleetx/utils/download.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
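# Download helpers used to fetch pretrained weights and datasets:
# - cached_path() resolves a URL or local path against ~/.cache/ppfleetx/ and
#   only downloads on a cache miss.
# - _download() retries up to DOWNLOAD_RETRY_LIMIT times and streams into a
#   "<name>_tmp" file first, so an interrupted transfer never leaves a
#   truncated file at the final destination.
# - download() lets only local rank 0 perform the download in multi-card runs;
#   the other ranks wait until the file appears.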
import os import time import requests import shutil from ppfleetx.utils.log import logger from tqdm import tqdm import paddle DOWNLOAD_RETRY_LIMIT = 3 def is_url(path): """ Whether path is URL. Args: path (string): URL string or not. """ return path.startswith('http://') or path.startswith('https://') def _map_path(url, root_dir): # parse path after download under root_dir fname = os.path.split(url)[-1] fpath = fname return os.path.join(root_dir, fpath) def cached_path(url_or_path, cache_dir=None): if cache_dir is None: cache_dir = '~/.cache/ppfleetx/' cache_dir = os.path.expanduser(cache_dir) if not os.path.exists(cache_dir): os.makedirs(cache_dir, exist_ok=True) if is_url(url_or_path): path = _map_path(url_or_path, cache_dir) url = url_or_path else: path = url_or_path url = None if os.path.exists(path): logger.info( f"Found {os.path.split(path)[-1]} in cache_dir: {cache_dir}.") return path download(url, path) return path def _download(url, fullname): """ Download from url, save to path. url (str): download url path (str): download to given path """ retry_cnt = 0 while not os.path.exists(fullname): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 else: raise RuntimeError("Download from {} failed. " "Retry limit reached".format(url)) logger.info("Downloading {}".format(url)) try: req = requests.get(url, stream=True) except Exception as e: # requests.exceptions.ConnectionError logger.info("Downloading {} failed {} times with exception {}". format(url, retry_cnt + 1, str(e))) time.sleep(1) continue if req.status_code != 200: raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) # For protecting download interupted, download to # tmp_fullname firstly, move tmp_fullname to fullname # after download finished tmp_fullname = fullname + "_tmp" total_size = req.headers.get('content-length') with open(tmp_fullname, 'wb') as f: if total_size: with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: for chunk in req.iter_content(chunk_size=1024): f.write(chunk) pbar.update(1) else: for chunk in req.iter_content(chunk_size=1024): if chunk: f.write(chunk) shutil.move(tmp_fullname, fullname) return fullname def download(url, path): local_rank = 0 world_size = 1 if paddle.fluid.core.is_compiled_with_dist( ) and paddle.distributed.get_world_size() > 1: local_rank = paddle.distributed.ParallelEnv().dev_id world_size = paddle.distributed.get_world_size() if world_size > 1 and local_rank != 0: while not os.path.exists(path): time.sleep(1) else: _download(url, path) ================================================ FILE: ppfleetx/utils/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
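# Helpers for exporting a dygraph model as a static inference program:
# - _prune_input_spec() temporarily switches to static mode, prunes the traced
#   program down to the requested targets, and keeps only the input specs that
#   still exist in the pruned program.
# - export_inference_model() converts the model with paddle.jit.to_static and
#   saves it with paddle.jit.save; when export_quant_model is True, it saves
#   the quantized model through the supplied quanter instead.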
import os import paddle import logging from .log import logger __all__ = ['export_inference_model'] def _prune_input_spec(input_spec, program, targets): # try to prune static program to figure out pruned input spec # so we perform following operations in static mode device = paddle.get_device() paddle.enable_static() paddle.set_device(device) pruned_input_spec = [] program = program.clone() program = program._prune(targets=targets) global_block = program.global_block() for spec in input_spec: try: v = global_block.var(spec.name) pruned_input_spec.append(spec) except Exception: pass paddle.disable_static(place=device) return pruned_input_spec def export_inference_model( model, input_spec, save_dir='./output', save_name='model', export_quant_model=False, quanter=None, ): if not os.path.exists(save_dir): os.makedirs(save_dir) static_model = paddle.jit.to_static(model, input_spec) pruned_input_spec = _prune_input_spec(input_spec, static_model.forward.main_program, static_model.forward.outputs) if export_quant_model: quanter.save_quantized_model( model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) logger.info("export quantized inference model saved in {}".format( save_dir)) return paddle.jit.save( static_model, os.path.join(save_dir, save_name), input_spec=pruned_input_spec) logger.info("export inference model saved in {}".format(save_dir)) ================================================ FILE: ppfleetx/utils/file.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
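# Small file utilities shared by the data pipelines:
# - unzip()/untar() are decorated with env.work_at_local_rank0 so each node
#   extracts an archive exactly once in distributed runs.
# - parse_csv() reads a delimited text file and applies optional filter_funcs
#   (row -> bool) and map_funcs (row -> row) to every row before collecting it.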
import os import csv import zipfile import tarfile from typing import Iterable, Callable import paddle from ppfleetx.distributed.apis import env @env.work_at_local_rank0 def unzip(zip_path, mode="r", out_dir=None, delete=False): with zipfile.ZipFile(zip_path, mode) as zip_ref: zip_ref.extractall(out_dir) if delete: os.remove(zip_path) @env.work_at_local_rank0 def untar(tar_path, mode="r:gz", out_dir=None, delete=False): try: with tarfile.open(tar_path, mode) as f: f.extractall(out_dir) finally: if delete: os.remove(tar_path) def parse_csv(path, skip_lines=0, delimiter=' ', quotechar='|', quoting=csv.QUOTE_NONE, map_funcs=None, filter_funcs=None): with open(path, newline='') as csvfile: data = [] spamreader = csv.reader( csvfile, delimiter=delimiter, quotechar=quotechar, quoting=quoting) for idx, row in enumerate(spamreader): if idx < skip_lines: continue filter_flag = True if filter_funcs is not None: if isinstance(filter_funcs, Iterable): for func in filter_funcs: filter_flag = func(row) if filter_flag is False: break else: assert isinstance(filter_funcs, Callable) filter_flag = filter_funcs(row) if filter_flag is False: continue if map_funcs is not None: if isinstance(map_funcs, Iterable): for func in map_funcs: row = func(row) else: assert isinstance(map_funcs, Callable) row = map_funcs(row) data.append(row) return data ================================================ FILE: ppfleetx/utils/log.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
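# Colorized logger used across PaddleFleetX. Besides the standard levels, two
# custom levels are registered: TRAIN (21) and EVAL (22), so training and
# evaluation messages can be filtered and colored separately. The module-level
# `logger` instance defined below is what the rest of the code base imports.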
import contextlib import copy import functools import logging import os import sys import time import datetime import threading from typing import List import colorlog from colorama import Fore import paddle loggers = {} log_config = { 'DEBUG': { 'level': 10, 'color': 'purple' }, 'INFO': { 'level': 20, 'color': 'green' }, 'TRAIN': { 'level': 21, 'color': 'cyan' }, 'EVAL': { 'level': 22, 'color': 'blue' }, 'WARNING': { 'level': 30, 'color': 'yellow' }, 'ERROR': { 'level': 40, 'color': 'red' }, 'CRITICAL': { 'level': 50, 'color': 'bold_red' } } class Logger(object): ''' Deafult logger in PaddleFleetX Args: name(str) : Logger name, default is 'PaddleFleetX' ''' def __init__(self, name: str=None): name = 'PaddleFleetX' if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): logging.addLevelName(conf['level'], key) self.__dict__[key] = functools.partial(self.__call__, conf['level']) self.__dict__[key.lower()] = functools.partial(self.__call__, conf['level']) self.format = colorlog.ColoredFormatter( '%(log_color)s[%(asctime)-15s] [%(levelname)s]%(reset)s - %(message)s', log_colors={ key: conf['color'] for key, conf in log_config.items() }) self.handler = logging.StreamHandler() self.handler.setFormatter(self.format) self.logger.addHandler(self.handler) self.logLevel = 'DEBUG' self.logger.setLevel(logging.DEBUG) self.logger.propagate = False self._is_enable = True def disable(self): self._is_enable = False def enable(self): self._is_enable = True @property def is_enable(self) -> bool: return self._is_enable def __call__(self, log_level: str, msg: str): if not self.is_enable: return self.logger.log(log_level, msg) @contextlib.contextmanager def use_terminator(self, terminator: str): old_terminator = self.handler.terminator self.handler.terminator = terminator yield self.handler.terminator = old_terminator @contextlib.contextmanager def processing(self, msg: str, interval: float=0.1): ''' Continuously print a progress bar with rotating special effects. Args: msg(str): Message to be printed. interval(float): Rotation interval. Default to 0.1. ''' end = False def _printer(): index = 0 flags = ['\\', '|', '/', '-'] while not end: flag = flags[index % len(flags)] with self.use_terminator('\r'): self.info('{}: {}'.format(msg, flag)) time.sleep(interval) index += 1 t = threading.Thread(target=_printer) t.start() yield end = True logger = Logger() def advertise(): """ Show the advertising message like the following: =========================================================== == PaddleFleetX is powered by PaddlePaddle ! == =========================================================== == == == For more info please go to the following website. == == == == https://github.com/PaddlePaddle/PaddleFleetX == =========================================================== """ copyright = "PaddleFleetX is powered by PaddlePaddle !" ad = "For more info please go to the following website." 
website = "https://github.com/PaddlePaddle/PaddleFleetX" AD_LEN = 6 + len(max([copyright, ad, website], key=len)) logger.info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format( "=" * (AD_LEN + 4), "=={}==".format(copyright.center(AD_LEN)), "=" * (AD_LEN + 4), "=={}==".format(' ' * AD_LEN), "=={}==".format(ad.center(AD_LEN)), "=={}==".format(' ' * AD_LEN), "=={}==".format(website.center(AD_LEN)), "=" * (AD_LEN + 4), )) from .device import synchronize def get_timestamp(): if synchronize(): return time.time() else: logger.warning(f"Device synchronizing failed, which may result uncorrect time") return time.time() def convert_timestamp_to_data(timeStamp): return str(datetime.timedelta(seconds=int(timeStamp))) ================================================ FILE: ppfleetx/utils/tensor_fusion_helper.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from paddle.framework import core import numpy as np from collections import OrderedDict from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_storage import ParamStorage, GradStorage from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import Type alignment = {"gpu": 256, } align = { Type.fp16.value: 2, Type.fp32.value: 4, } def assign_group_by_size(parameters, group_size=256 * 1024 * 1024): is_sparse_gradient = [False] * len(parameters) group_indices = core.eager_assign_group_by_size( parameters, is_sparse_gradient, [group_size, group_size]) var_groups = OrderedDict() for group_idx, indices in enumerate(group_indices): for index in indices: var_groups.setdefault(group_idx, []).append(parameters[index]) return var_groups def flatten_dense_tensors(parameters): _buffer_size = 0 _param2align = {} dtype = parameters[0].dtype for param in parameters: assert param.trainable, "param must be trainable..." 
size = np.prod(param.shape) * align[dtype] remaining = size % alignment["gpu"] ali = 0 if remaining == 0 else alignment["gpu"] - remaining align_ = ali // align[dtype] _buffer_size += np.prod(param.shape) + align_ _param2align[param.name] = align_ param_storage = ParamStorage(size=_buffer_size, dtype=dtype, device="gpu") param_storage.add_rank_params(parameters, _param2align) # process gradient grad_storage = GradStorage( size=_buffer_size, dtype=dtype, device="gpu", destination="0", parm2align=_param2align) for param in parameters: grad_storage.add_grad(param, _param2align[param.name]) # param_storage --> grad_storage param_storage.buffer._copy_gradient_from(grad_storage.buffer) param_storage.buffer.stop_gradient = False return param_storage, grad_storage def obtain_storage(parameters): if len(parameters) < 1: return [] var_groups = assign_group_by_size(parameters) storage = [] for group_idx, parameters in var_groups.items(): param_storage, grad_storage = flatten_dense_tensors(parameters) storage.append(param_storage.buffer) return storage def fused_parameters(parameters, use_sharding=False): decay_params = [] other_params = [] for param in parameters: if not any(nd in param.name for nd in ["bias", "norm", "b_0"]): decay_params.append(param) else: other_params.append(param) decay_fused = decay_params if use_sharding else obtain_storage( decay_params) other_fused = other_params if use_sharding else obtain_storage( other_params) all_fused = decay_fused + other_fused return decay_fused, all_fused def all_reduce_parameters(params, group): if group.nranks < 2: return div_factor = 1.0 / group.nranks with paddle.framework.no_grad(): for p in params: grad = p.grad.scale_(div_factor) paddle.distributed.all_reduce(grad, group=group) ================================================ FILE: ppfleetx/utils/version.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from ppfleetx.utils.log import logger def version_check(): version = paddle.version.full_version logger.info('run with paddle {}, commit id {}'.format(paddle.__version__, paddle.__git_commit__[:8])) if version != '0.0.0': paddle.utils.require_version(min_version='2.4.0') ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp1.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
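# Exports the 345M ERNIE model for inference through the auto-parallel export
# entry point (tools/auto_export.py) on a single GPU (mp_degree=1).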
log_dir=log_auto rm -rf $log_dir # 345M mp1 export python -m paddle.distributed.launch --log_dir $log_dir --devices "0" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \ -o Distributed.mp_degree=1 \ ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp2.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 345M mp2 export python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \ -o Distributed.mp_degree=2 \ ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp2_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 345M mp2 export python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \ -o Distributed.mp_degree=2 \ -o Global.device=npu ================================================ FILE: projects/ernie/auto_export_ernie_345M_mp2_xpu.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
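# Exports the 345M ERNIE model with 2-way model parallelism on two XPU devices.
# The sed command below flips the base config's device from gpu to xpu before
# export, and BKCL_PCIE_RING is set for the XPU collective communication
# library (BKCL).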
log_dir=log_auto
rm -rf $log_dir

FILENAME=./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_base.yaml
sed -i "s/device: gpu/device: xpu/g" $FILENAME
export BKCL_PCIE_RING=1

# 345M mp2 export
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \
    ./tools/auto_export.py \
    -c ./ppfleetx/configs/nlp/ernie/auto/finetune_ernie_345M_single_card.yaml \
    -o Distributed.mp_degree=2 \


================================================
FILE: projects/ernie/docs/README.md
================================================
# ERNIE: Enhanced Representation through kNowledge IntEgration

## 1. Model introduction

ERNIE is Baidu's pioneering knowledge-enhanced continual-learning framework for semantic understanding. It combines large-scale pre-training data with rich, multi-source knowledge and, through continual learning, keeps absorbing lexical, structural and semantic knowledge from massive text corpora, so that the model keeps improving.

ERNIE significantly surpasses previous state-of-the-art results on 16 public datasets covering sentiment analysis, text matching, natural language inference, lexical analysis, reading comprehension, intelligent question answering and more. On GLUE, the authoritative benchmark for general language understanding, it was the first to break the 90-point mark and ranked first worldwide. The related work has been published at top conferences such as AAAI and IJCAI. ERNIE is also deployed at scale in industry, e.g. in search engines, news recommendation, advertising systems, voice interaction and intelligent customer service.

ERNIE learns real-world semantic knowledge by modeling words, entities and entity relations in massive data. Whereas BERT learns from the raw language signal, ERNIE directly models prior semantic knowledge units, which strengthens its semantic representation ability. An example:

```
Learnt by BERT :哈 [mask] 滨是 [mask] 龙江的省会,[mask] 际冰 [mask] 文化名城。
Learnt by ERNIE:[mask] [mask] [mask] 是黑龙江的省会,国际 [mask] [mask] 文化名城。
```

With BERT, the model can recover the character 『尔』 simply from the local co-occurrence of 『哈』 and 『滨』, without learning any knowledge about the entity 『哈尔滨』 (Harbin). ERNIE, by learning representations of words and entities, can model the relation between 『哈尔滨』 and 『黑龙江』 (Heilongjiang): that Harbin is the capital of Heilongjiang and a city famous for ice and snow.

### 1.1 Directory structure

```text
.
├── docs
│   └── inference.md
│   └── README.md
├── auto_export_ernie_345M_mp1.sh           # 345M ernie-base model, auto-parallel single-card export
├── auto_export_ernie_345M_mp2.sh           # 345M ernie-base model, auto-parallel multi-card export
├── auto_export_ernie_345M_mp2_xpu.sh       # 345M ernie-base model, auto-parallel multi-card export (XPU)
├── export_ernie_345M_single_card.sh        # 345M ernie-base model, single-card export
├── finetune_ernie_345M_single_card.sh      # 345M ernie-base model, single-card finetuning
├── inference.py                            # ernie inference code
├── pretrain_ernie_base_175B_mp8_pp16.sh    # 175B ernie-base model, 3D hybrid parallelism
├── pretrain_ernie_base_3D.sh               # CI test
├── pretrain_ernie_base_6.7B_sharding16.sh  # 6.7B ernie-base model, sharding16
├── pretrain_ernie_base.sh                  # 345M ernie-base model, single card
├── pretrain_ernie_large.sh                 # ernie-large model, single card
├── run_inference.sh                        # ernie inference launch script
├── run_inference_mp2.sh                    # ernie multi-card inference launch script
└── run_inference_mp2_xpu.sh                # ernie multi-card inference launch script (XPU)
```

### 1.2 Dependencies

- paddlenlp
- pybind11

Install them with `pip install pybind11 paddlenlp`.

## 2. Chinese pre-training

ERNIE is pre-trained with an MLM (Masked Language Model) objective using WWM (Whole Word Masking), so all tokens belonging to one complete semantic unit are masked together. The overall training loss is mlm_loss + sop_loss.

### 2.1 Pre-training on a small corpus: 14GB - CLUECorpusSmall
#### Data preparation

For the download, please refer to the [data_tools](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/ernie/preprocess/docs/CLUECorpusSmall.md) docs and follow the `CLUECorpusSmall` dataset processing tutorial there to fetch the data. Once the download has finished:

Unpack the files:

```shell
unzip comment2019zh_corpus.zip -d clue_corpus_small_14g/comment2019zh_corpus
unzip news2016zh_corpus.zip -d clue_corpus_small_14g/news2016zh_corpus
unzip webText2019zh_corpus.zip -d clue_corpus_small_14g/webText2019zh_corpus
unzip wiki2019zh_corpus.zip -d clue_corpus_small_14g/wiki2019zh_corpus
```

Convert the txt files to jsonl format:

```
python ./ppfleetx/data/data_tools/ernie/preprocess/trans_to_json.py --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl
```

We now have the dataset in jsonl format. Next, the data is converted into the format required by the training task; ernie is used as the example here:

```
python -u ./ppfleetx/data/data_tools/ernie/preprocess/create_pretraining_data.py \
    --model_name ernie-1.0-base-zh \
    --tokenizer_name ErnieTokenizer \
    --input_path clue_corpus_small_14g.jsonl \
    --split_sentences\
    --chinese \
    --cn_whole_word_segment \
    --cn_seg_func jieba \
    --output_prefix clue_corpus_small_14g_20220104 \
    --workers 48 \
    --log_interval 10000
```

The corpus contains roughly `15702702` documents; because word segmentation is time-consuming, this step takes about one hour. The files needed for training are produced in the current directory:

```
clue_corpus_small_14g_20220104_ids.npy
clue_corpus_small_14g_20220104_idx.npz
```
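A quick way to sanity-check the produced files before launching training is to load them with NumPy. This is a minimal sketch; the exact array names stored inside the `.npz` index depend on the preprocessing script version, so they are printed rather than assumed:

```python
import numpy as np

# Flat token-id stream written by create_pretraining_data.py
ids = np.load("clue_corpus_small_14g_20220104_ids.npy", mmap_mode="r")
print("token ids:", ids.shape, ids.dtype)

# Index describing document/sentence boundaries; key names may differ
idx = np.load("clue_corpus_small_14g_20220104_idx.npz")
print("index arrays:", idx.files)
```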
#### Start training

Move the generated data files `clue_corpus_small_14g_20220104_ids.npy` and `clue_corpus_small_14g_20220104_idx.npz` into input_dir, and training can begin.

Besides single-card training, PaddlePaddle supports data parallelism, hybrid parallelism, auto parallelism, recomputation and other distributed strategies that reduce memory usage and speed up training, making large models both trainable and fast to train. Before training, an appropriate parallel strategy should be chosen according to the model size. The following introduces the configuration files and launch commands for ERNIE training from three angles: single-card training, hybrid-parallel training and auto-parallel training.

- Single-card training

```shell
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
# 345M
python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml
```

- Hybrid parallelism

```shell
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
# 175B run_pretrain
log_dir=log_175B
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
    ./tools/train.py \
    -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml
```

## 3. Downstream task finetuning

Based on a checkpoint produced during training, users can quickly evaluate the current model. PaddleFleetX already supports the mainstream downstream task of sequence classification; users can evaluate whichever datasets they need.

#### Examples

- Single-card training

```
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
python tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml
```

- Data parallelism

```
cd PaddleFleetX # skip if you are already in the PaddleFleetX root directory
log_dir=log_dp8
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
    ./tools/train.py \
    -c ./ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml \
    -o Model.use_recompute=True
```
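The `-o` options above override individual entries of the YAML config by dotted key path; they are parsed by `override_config`/`get_config` in `ppfleetx/utils/config.py`. Below is a minimal sketch of the same mechanism used programmatically; the config path and override values are illustrative only:

```python
from ppfleetx.utils.config import get_config

# Equivalent to passing on the command line:
#   -o Model.use_recompute=True -o Engine.max_steps=10
# Each override has the form "dotted.key.path=value".
config = get_config(
    "ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml",
    overrides=["Model.use_recompute=True", "Engine.max_steps=10"])

print(config.Model.use_recompute, config.Engine.max_steps)
```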
## 4. Inference deployment

[Inference deployment](inference.md)


================================================
FILE: projects/ernie/docs/inference.md
================================================
# Inference deployment

After training, the model can be deployed for inference with Paddle Inference, PaddlePaddle's high-performance inference engine, as follows.

## 1. Model export

Taking the `ERNIE(345M)` model as an example:

Export the single-card `ERNIE(345M)` model:

```bash
sh projects/ernie/auto_export_ernie_345M_mp1.sh
```

Export the multi-card `ERNIE(345M)` model:

```bash
sh projects/ernie/auto_export_ernie_345M_mp2.sh
```

Export the multi-card `ERNIE(345M)` model (XPU):

```bash
sh projects/ernie/auto_export_ernie_345M_mp2_xpu.sh
```

## 2. Inference deployment

After the model has been exported, inference can be run through the `projects/ernie/inference.py` script.

`ERNIE(345M)` inference

```bash
bash projects/ernie/run_inference.sh
```

`ERNIE(345M)` multi-card inference

```bash
bash projects/ernie/run_inference_mp2.sh
```

`ERNIE(345M)` multi-card inference (XPU)

```bash
bash projects/ernie/run_inference_mp2_xpu.sh
```

## 3. Benchmark

In progress.

================================================ FILE: projects/ernie/export_ernie_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0 python ./tools/export.py -c ./ppfleetx/configs/nlp/ernie/inference_ernie_345M_single_card.yaml ================================================ FILE: projects/ernie/finetune_ernie_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml ================================================ FILE: projects/ernie/finetune_ernie_345M_single_card_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
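# Finetunes the 345M ERNIE config on a single NPU device; the hidden size is
# overridden to 256 here.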
python tools/train.py -c ppfleetx/configs/nlp/ernie/finetune_ernie_345M_single_card.yaml \ -o Global.device=npu \ -o Model.hidden_size=256 ================================================ FILE: projects/ernie/inference.py ================================================ # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) import numpy as np import paddle.distributed.fleet as fleet from ppfleetx.data.tokenizers import GPTTokenizer from ppfleetx.core.engine import InferenceEngine import argparse def parse_args(): parser = argparse.ArgumentParser("ernie inference") parser.add_argument( '-m', '--model_dir', type=str, default='./output', help='model dir') parser.add_argument( '-mp', '--mp_degree', type=int, default=1, help='mp degree') parser.add_argument( '-d', '--device', type=str, default='', help='device type') args = parser.parse_args() return args def main(args): fleet.init(is_collective=True) infer_engine = InferenceEngine( args.model_dir, args.mp_degree, device=args.device) tokenizer = GPTTokenizer.from_pretrained("gpt2") text = 'Hi ERNIE. Tell me who Jack Ma is.' inputs = tokenizer(text, padding=True, return_attention_mask=True) whole_data = [ np.array(inputs['token_type_ids']).reshape(1, -1), np.array(inputs['input_ids']).reshape(1, -1) ] outs = infer_engine.predict(whole_data) print(outs) if __name__ == "__main__": args = parse_args() main(args) ================================================ FILE: projects/ernie/pretrain_ernie_base.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=1 python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_345M_single_card.yaml ================================================ FILE: projects/ernie/pretrain_ernie_base_175B_mp8_pp16.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir # 175B run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_175B_mp8_pp16.yaml ================================================ FILE: projects/ernie/pretrain_ernie_base_3D.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ -o Data.Train.dataset.input_dir=./dataset/ernie \ -o Data.Eval.dataset.input_dir=./dataset/ernie \ -o Engine.max_steps=10 ================================================ FILE: projects/ernie/pretrain_ernie_base_3D_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir export PADDLE_P2P_SYNC_SEND=1 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_base_3D.yaml \ -o Data.Train.dataset.input_dir=./dataset/ernie \ -o Data.Eval.dataset.input_dir=./dataset/ernie \ -o Engine.max_steps=10 \ -o Global.device=npu ================================================ FILE: projects/ernie/pretrain_ernie_base_6.7B_sharding16.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. log_dir=log_hybrid rm -rf $log_dir # 6.7B+sharding16 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/nlp/ernie/pretrain_ernie_base_6.7B_sharding16.yaml ================================================ FILE: projects/ernie/pretrain_ernie_large.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=1 python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml ================================================ FILE: projects/ernie/pretrain_ernie_large_mp2_mlu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export MLU_VISIBLE_DEVICES=0,1 export PADDLE_XCCL_BACKEND=mlu export FLAGS_selected_mlus=0,1 LOG_DIR=log_ernie LOG_GFILE=log_ernie_large_hybrid mkdir -p ${LOG_DIR} python -m paddle.distributed.launch \ --log_dir ${LOG_DIR} \ --device 0,1 tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=mlu \ -o Distributed.mp_degree=2 \ -o Distributed.dp_degree=1 \ -o Distributed.pp_degree=1 \ -o Model.use_recompute=False > ${LOG_DIR}/${LOG_GFILE} 2>&1 & ================================================ FILE: projects/ernie/pretrain_ernie_large_mp2_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
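# Pretrains ERNIE-large with 2-way model parallelism on two NPU devices
# (dp_degree=1, pp_degree=1, recompute disabled).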
python -m paddle.distributed.launch \ --device 0,1 tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=npu \ -o Distributed.mp_degree=2 \ -o Distributed.dp_degree=1 \ -o Distributed.pp_degree=1 \ -o Model.use_recompute=False ================================================ FILE: projects/ernie/pretrain_ernie_large_mp2_pp2_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_P2P_SYNC_SEND=1 python -m paddle.distributed.launch \ --device 0,1,2,3 tools/train.py \ -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=npu \ -o Distributed.mp_degree=2 \ -o Distributed.dp_degree=1 \ -o Distributed.pp_degree=2 \ -o Model.use_recompute=True ================================================ FILE: projects/ernie/pretrain_ernie_large_npu.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python tools/train.py -c ppfleetx/configs/nlp/ernie/pretrain_ernie_large_single_card.yaml \ -o Global.device=npu ================================================ FILE: projects/ernie/run_inference.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. unset CUDA_VISIBLE_DEVICES python -u -m paddle.distributed.launch \ --gpus "0,1" \ --log_dir "log" \ projects/ernie/inference.py --model_dir "./output" --mp_degree 2 ================================================ FILE: projects/ernie/run_inference_mp2_npu.sh ================================================ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python -u -m paddle.distributed.launch \ --devices "0,1" \ --log_dir "log" \ projects/ernie/inference.py --model_dir "./output" --mp_degree 2 --device npu ================================================ FILE: projects/ernie/run_inference_mp2_xpu.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export BKCL_PCIE_RING=1 python -u -m paddle.distributed.launch \ --devices "0,1" \ --log_dir "log" \ projects/ernie/inference.py --model_dir "./output" --mp_degree 2 ================================================ FILE: projects/gpt/auto_export_gpt_175B_mp8.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_mp8 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_175B_mp8.yaml ================================================ FILE: projects/gpt/auto_export_gpt_345M_mp2.sh ================================================ #! 
/bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_mp2 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_mp2.yaml \ ================================================ FILE: projects/gpt/auto_export_gpt_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_345m_mp1 rm -rf $log_dir DIRECTORY=./pretrained if [ ! -d "$DIRECTORY" ]; then echo "start download ckpt" wget https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M_FP16.tar.gz tar -zxvf GPT_345M_FP16.tar.gz fi python -m paddle.distributed.launch --log_dir $log_dir --devices "1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_345M_single_card.yaml \ -o Engine.save_load.ckpt_dir=./pretrained/auto ================================================ FILE: projects/gpt/auto_export_gpt_6.7B_mp1.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_mp1 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/generation_gpt_6.7B_mp1.yaml ================================================ FILE: projects/gpt/auto_export_gpt_fp16_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python ./tools/auto_export.py -c ./ppfleetx/configs/nlp/gpt/auto/export_gpt_fp16_single_card.yaml \ -o Engine.save_load.output_dir="./serial_model" \ -o Engine.save_load.ckpt_dir="./output/rank_0/model" \ ================================================ FILE: projects/gpt/auto_gpt_1.3B_dp8.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 1.3B+dp8 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml ================================================ FILE: projects/gpt/auto_gpt_1.3B_dp8_tuning.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 1.3B+dp8 recompute tuning python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8_tuning.yaml ================================================ FILE: projects/gpt/auto_gpt_1.3B_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
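# Editor's note (assumption, not part of the original script): FLAGS_USE_STANDALONE_EXECUTOR=False
# falls back to Paddle's legacy executor, which the auto-parallel single-card examples in this repo
# set before calling tools/auto.py; CUDA_VISIBLE_DEVICES=0 pins the run to a single GPU.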
export FLAGS_USE_STANDALONE_EXECUTOR=False export CUDA_VISIBLE_DEVICES=0 python ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml ================================================ FILE: projects/gpt/auto_gpt_345M_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export FLAGS_USE_STANDALONE_EXECUTOR=False export CUDA_VISIBLE_DEVICES=0 python ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_345M_single_card.yaml ================================================ FILE: projects/gpt/auto_gpt_6.7B_sharding16.sh ================================================ #! /bin/bash # Runs the "6.7B" parameter model # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir # 6.7B+sharding16 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml ================================================ FILE: projects/gpt/auto_qat_export_gpt_345M_mp2.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_auto rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1" \ ./tools/auto_export.py \ -c ./ppfleetx/configs/nlp/gpt/auto/qat_generation_gpt_345M_mp2.yaml \ -o Engine.save_load.output_dir="./mp2_qat_model" \ ================================================ FILE: projects/gpt/benchmark.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time import argparse import numpy as np import paddle import paddle.distributed.fleet as fleet from ppfleetx.core.engine.inference_engine import InferenceEngine import ppfleetx_ops def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( "--seq_len", default=128, type=int, required=False, help="seq length of inputs") parser.add_argument( "--iter", default=100, type=int, help="run iterations for timing") parser.add_argument("--mp_degree", default=1, type=int, help="") parser.add_argument( "--model_dir", default="output", type=str, help="model directory") args = parser.parse_args() return args def predict(engine, data, args): with engine._static_guard: for d, name in zip(data, engine.input_names()): handle = engine.predictor.get_input_handle(name) handle.copy_from_cpu(d) for _ in range(10): engine.predictor.run() engine.predictor.get_output_handle(engine.output_names()[ 0]).copy_to_cpu() start = time.perf_counter() for _ in range(args.iter): engine.predictor.run() end = time.perf_counter() print( f"batch {args.iter} run time: {1000 * (end - start) / args.iter}ms") return {name: engine.predictor.get_output_handle(name).copy_to_cpu() \ for name in engine.output_names()} def main(): args = parse_args() fleet.init(is_collective=True) infer_engine = InferenceEngine(args.model_dir, args.mp_degree) ids = [100] * args.seq_len # run test for batch in [1, 2, 4, 8, 16]: whole_data = [ids] * batch whole_data = np.array(whole_data, dtype="int64").reshape(1, batch, -1) _ = predict(infer_engine, whole_data, args) if __name__ == "__main__": main() ================================================ FILE: projects/gpt/docs/README.md ================================================ # GPT ## 模型介绍 GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)/[3](https://arxiv.org/pdf/2005.14165.pdf) 是以[Transformer](https://arxiv.org/abs/1706.03762) 解码器为网络基本组件,使用自回归的方式在大规模无标注文本语料上进行预训练得到的语言生成模型。 本项目是语言模型 GPT 的 PaddlePaddle 大模型实现。目前,PaddleFleetX 提供了 [GPT-345M](https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz) 的预训练模型文件;分别基于 [LAMBADA](https://raw.githubusercontent.com/cybertronai/bflm/master/lambada_test.jsonl) 和 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip) 数据集,采用 ACC(accuracy) 和 PPL(perplexity) 指标后的评估结果如下: | **模型文件** | **ACC** | **PPL** | |---------|-----------|---------------| | GPT-345M | 44.17% | 18.01 | 下面是本例的简要目录结构及说明: ```text . 
├── auto_export_gpt_345M_mp2.sh # 自动并行345M模型两卡张量并行导出入口 ├── auto_gpt_345M_single_card.sh # 自动并行345M模型单卡预训练入口 ├── auto_gpt_1.3B_single_card.sh # 自动并行1.3B模型单卡预训练入口 ├── auto_gpt_1.3B_dp8.sh # 自动并行1.3B模型数据并行预训练入口 ├── auto_gpt_6.7B_sharding16.sh # 自动并行6.7B模型分组切片并行预训练入口 ├── evaluate_gpt_345M_single_card.sh # 单卡345M模型评估入口 ├── export_gpt_345M_single_card.sh # 单卡345M模型动转静导出入口 ├── finetune_gpt_345M_single_card.sh # 单卡345M模型finetune训练入口 ├── inference_gpt_345M_single_card.sh # 单卡345M模型推理入口 ├── pretrain_gpt_345M_single_card.sh # 单卡345M模型预训练入口 ├── pretrain_gpt_1.3B_single_card.sh # 单卡1.3B模型预训练入口 ├── pretrain_gpt_1.3B_dp8.sh # 8卡1.3B模型数据并行预训练入口 ├── pretrain_gpt_6.7B_sharding16.sh # 16卡6.7B模型分组切片并行预训练入口 ├── pretrain_gpt_175B_mp8_pp16.sh # 128卡175B模型混合并行预训练入口 ├── qat_gpt_345M_single_card.sh # 单卡345M模型量化训练入口 ├── qat_gpt_345M_mp8.sh # 8卡345M模型模型并行量化训练入口 ├── qat_gpt_6.7B_sharding16.sh # 16卡6.7B模型分组切片并行量化训练入口 ├── eval_qat_gpt_345M_single_card.sh # 单卡345M量化模型验证入口 ├── export_qat_gpt_345M_single_card.sh # 单卡345M量化模型导出入口 ``` ## 快速开始 ### 环境依赖 请确保已根据根目录 requirements.txt 安装所需依赖,或者通过以下命令快速安装 ```shell python -m pip install -r https://raw.githubusercontent.com/PaddlePaddle/PaddleFleetX/develop/requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ### 数据准备 数据获取和制作详见[GPT 模型预训练数据准备流程](https://github.com/PaddlePaddle/PaddleFleetX/tree/develop/ppfleetx/data/data_tools/gpt) 为了方便用户运行测试本模型,此处提供处理好的300M的训练样本,在单卡训练或混合并行训练前都需要通过以下命令获取数据。 **数据下载命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 # 下载样例数据 mkdir data && cd data wget -O gpt_en_dataset_300m_ids.npy https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy wget -O gpt_en_dataset_300m_idx.npz https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz cd .. # 回到 PaddleFleetX 根目录下 ``` ### 模型训练 除了单卡训练,飞桨还支持数据并行、混合并行、自动并行、重计算等多种分布式策略,减少显存占用、加速训练,达到大模型可训练且训得快的效果。在模型训练前,需要根据模型规模选择合适的并行策略。下面分别从单卡训练、混合并行训练和自动并行训练三个方面来介绍GPT模型训练的配置文件和启动方式。 - [单卡训练](./single_card.md) - [混合并行训练](./hybrid_parallel.md) - [自动并行训练](./auto_parallel.md) ### 文本生成体验 - [单卡预训练模型文本生成](./single_card.md#GPT-Zero-shot-文本生成) - [混合并行预训练模型文本生成](./hybrid_parallel.md#GPT-Zero-shot-文本生成) ### 模型压缩 - [量化训练](./quantization_aware_training.md) ### 推理部署 - [推理部署](inference.md) ### GLUE 下游任务微调 - [单卡微调](./single_finetune.md) ## 参数释义 ### 全局信息 全局参数指定训练的batch size,以及设备、随机种子等信息。 ```yaml Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | device | 设备信息 | | seed | 随机数种子 | | global_batch_size | 全局的batch size大小,即一次参数更新等效的batch size | | local_batch_size | 每个进程训练的batch size大小 | | micro_batch_size | 每次前向计算的batch size大小 | ### Engine训练控制 Engine训练设置完成模型训练/验证/推理等过程中的参数设置,是fleetX的EagerEngine的必要参数,所有使用该Engine都必须指定该配置。 其中包含的参数有: ```yaml Engine: max_steps: 500000 num_train_epochs: 1 accumulate_steps: logging_freq: 1 eval_freq: 500 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: save_steps: 1000 save_epoch: 1 output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | accumulate_steps | 梯度累加次数 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | | 
enable | 是否使用混合精度策略进行训练 | | dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | | level | 混合精度训练模式,默认``O2``模式 | | scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔step数 | | save_epoch | 保存模型间隔epoch数 | | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ### 模型网络 网络部分完成了网络的组网操作,GPT在[PaddleFleetX/ppfleetx/models/language_model/gpt/dygraph/single_model.py]((https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/dygraph/single_model.py))下。 可以使用配置文件配置模型的规模,如: ```yaml Model: module: "GPTModule" name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True recompute_granularity: no_recompute_layers: fused_linear: True fuse_attn_qkv: True sequence_parallel: False ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | module | 指定GPT模型的执行模块 | | vocab_size | 训练词表大小 | | hidden_size | 隐藏层大小 | | num_layers | transformer层数 | | num_attention_heads | attention head的数量 | | max_seq_len | 输入文本序列的长度 | | ffn_hidden_size | ffn层大小,一般为隐藏层的四倍 | | attention_probs_dropout_prob | attention中的dropout的失活率 | | max_position_embeddings | position embedding的长度 | | type_vocab_size | 词表类型 | | initializer_range | 参数初始化的范围 | | use_recompute | 是否使用recompute训练 | | recompute_granularity | recompute训练的粒度,可选 `full` `full_attn` `core_attn`,full即recompute全部transformer,full_attn表明只recompute所有self attention部分,core_attn表明只recompute `softmax(qkT)v` 部分。注:显存占用方面,`core_attn` > `full_attn` > `full`,若所选策略产生OOM错误,可以适当更改recompute_granularity | |no_recompute_layers| list of integer,标识哪些层的transformer不需要进行recompute。所有在该list中的值应该 >= 0 同时应该 < num_layers。向该参数中增加不进行recompute 的层数可以提升模型训练的整体吞吐,但是会适当的增加显存。若训练中发现有显存富裕,可以适当增加不进行recompute的层数。如果使用该参数后出现OOM错误,可以适当减小不进行recompute的层数。 | | fused_linear | 是否使用fused_linear代替传统Linear加速训练。注:该功能需要cuda 11.6及以上编译的paddle支持。 | | fuse_attn_qkv | 是否对attention层中的qkv计算使用fuse策略以加速训练 | | sequence_parallel | 是否使用序列并行策略以加速训练。注:只有混合并行的GPT才支持该功能,它与张量模型并行共用通信组,当mp_degree=1时,序列并行策略会被强制关闭。 | | virtual_pp_degree | 虚拟流水线并行维度,该参数会减小流水线bubble的占比以提升流水线的吞吐。但是该参数会增加流水线间的通讯,所以该参数的推荐值为2。并且,只有 num_layers可以被 pp_degree * virtual_pp_degree 整除时,才可以使用虚拟流水线并行。 | ### 数据集 数据集参数分为“Train”、“Eval”和“Test”三部分,分别对应模型预训练、离线评估、推理等三个模块。 每个模型的配置参数都包含以下内容: ```yaml Data: Train: dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 sampler: name: DistributedBatchSampler shuffle: False drop_last: True loader: num_workers: 1 return_list: False collate_fn: gpt_collate_fn ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | dataset.name | 指定自定义数据集的名称 | | input_dir | 指定输入文件,可以使用目录,指定目录时将包括目录中的所有文件 | | split | 训练集,验证集和测试集的切分比例 | | max_seq_len | 输入文本序列的长度 | | sampler.name | 指定自定义采样器的名称 | | shuffle | 是否需要在生成样本下标时打乱顺序 | | drop_last | 是否需要丢弃最后无法凑整一个mini-batch的样本 | | num_workers | 用于加载数据的子进程个数 | | return_list | 每个设备上的数据是否以list形式返回 | | collate_fn | 通过此参数指定如果将样本列表组合为mini-batch数据;支持自定义 | ### 优化器 GPT训练默认使用AdamW优化器以及cosine学习率衰减,这里通过配置文件配置优化器的参数,如: ```yaml Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 
warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 tensor_fusion: False ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | name | 指定自定义优化器的名称 | | weight_decay | weight的衰减率 | | beta1 | 一阶矩估计的指数衰减率 | | beta2 | 二阶矩估计的指数衰减率 | | epsilon | 指定优化器需要优化的参数 | | lr.name | 指定自定义学习率策略的名称 | | decay_steps | 衰减的步长 | | warmup_rate | warmup 率 | | max_lr | Adam 的初始最大学习率 | | min_lr | Adam 的初始最小学习率 | | grad_clip.name | 指定自定义梯度裁剪策略的名称 | | clip_norm | 所允许的范数最大值 | | tensor_fusion | 是否使用tensor_fustion功能加速训练 | 另外,[Profiler](./hybrid_profiler.md)中还介绍了在 GPT 中开启 Profiler 并分析调试分析结果的方法及相关的参数解释。 ### 模型压缩 PaddleFleetX 集成了 PaddleSlim 中的常见的压缩方法:量化训练(Qutization Aware Training,QAT)、结构化稀疏(Structured Pruning,SP)和知识蒸馏(Knowledge Distillation,KD)。详细参数介绍见[模型压缩介绍](../../../docs/compression.md)。 ## 参考文献 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) - [Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165.pdf) - [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) ================================================ FILE: projects/gpt/docs/auto_parallel.md ================================================ # GPT 自动并行模型训练 分布式并行训练技术使超大模型成为可能,但分布式训练程序的编写门槛较高,并行算法较为复杂,开发者需同时具有较好的工程能力和算法功底。为了降低分布式训练的难度,自动并行成为新的研究热点,受到学术界和工业界的广泛关注。自动并行通常分为半自动并行和全自动并行。半自动并行指的是开发者在单机脚本的基础上额外添加少量标注信息即可表达并行逻辑。而全自动并行则无需开发者添加任何并行逻辑,根据单机脚本自动搜索出较为高效的并行策略,实现分布式训练。 ## 参数释义 ### 全局信息 全局信息指定训练的 batch size,以及设备、随机种子等信息 ```yaml Global: device: gpu seed: 1024 global_batch_size: local_batch_size: 1 micro_batch_size: 1 ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |--------------------------------|---------------------------| | device | 设备信息 | | seed | 随机数种子 | | global_batch_size | 全局的batch size大小,即一次参数更新等效的 batch size | | local_batch_size | 每个进程训练的batch size大小 | | micro_batch_size | 每次前向计算的batch size大小 | ### Engine训练控制 Engine训练设置完成模型训练/验证/推理等过程中的参数设置,是PaddleFleetX AutoEngine的必要参数,所有使用该Engine都必须指定该配置。 其中包含的参数有: ```yaml Engine: max_steps: 500000 num_train_epochs: 1 eval_freq: 1 eval_iters: 10 test_iters: mix_precision: enable: True dtype: "float16" level: "o2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] save_load: output_dir: ./output ckpt_dir: ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |-------------------|------------------------------------------| | max_steps | 最大训练步数 | | num_train_epochs | 训练的epoch数量 | | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔,以epoch为粒度 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | | enable | 是否使用混合精度的类型,可选: `True` `False` | | dtype | 使用混合精度的类型,可选: `float16` `bfloat16`| | level | 使用混合精度训练的等级,可选 `o1` `o2` `o3` | | scale_loss | 使用混合精度float16下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16/bfloat16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算。 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16/bfloat16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算。| | output_dir | 指定输出文件 | | ckpt_dir | checkpoint的加载目录 | ### 模型网络 网络部分完成了网络的组网操作,GPT在[PaddleFleetX/ppfleetx/models/language_model/gpt/auto/auto_model.py]((https://github.com/PaddlePaddle/PaddleFleetX/blob/develop/ppfleetx/models/language_model/gpt/auto/auto_model.py))下。 可以使用配置文件配置模型的规模,如: ```yaml Model: module: "GPTModuleAuto" name: "GPT" vocab_size: 50304 hidden_size: 1024 num_layers: 24 
num_attention_heads: 16 ffn_hidden_size: hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 max_position_embeddings: 1024 type_vocab_size: 16 initializer_range: 0.02 use_recompute: True fuse_attn_qkv: True ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |------------------------------|------------------------| | module | 指定GPT模型的执行模块 | | vocab_size | 训练词表大小 | | hidden_size | 隐藏层大小 | | num_layers | transformer层数 | | num_attention_heads | attention head的数量 | | max_seq_len | 输入文本序列的长度 | | ffn_hidden_size | ffn层大小,一般为隐藏层的四倍 | | attention_probs_dropout_prob | attention中的dropout的失活率 | | max_position_embeddings | position embedding的长度 | | type_vocab_size | 词表类型 | | initializer_range | 参数初始化的范围 | | use_recompute | 是否使用recompute训练,重计算全部transformer | | fuse_attn_qkv | 是否对attention层中qkv计算使用fuse代替传统Linear加速训练 | ### 数据集 数据集参数分为“Train”、“Eval”和“Test”三部分,分别对应模型预训练、离线评估、推理等三个模块。 每个模型的配置参数都包含以下内容: ```yaml Data: Train: collate_fn: gpt_collate_fn sample_split: 2 dataset: name: GPTDataset input_dir: ./data/ split: [949, 50, 1] max_seq_len: 1024 ``` 其中参数对应的释义如下: | **参数名** | **参数释义** | |-------------------|------------------------| | collate_fn | 通过此参数指定如果将样本列表组合为mini-batch数据;支持自定义 | | sample_split | 通过此参数dataset返回的sample被组织为(inputs,labels) | | dataset.name | 指定自定义数据集的名称 | | input_dir | 指定输入文件,可以使用目录,指定目录时将包括目录中的所有文件 | | split | 训练集,验证集和测试集的切分比例 | | max_seq_len | 输入文本序列的长度 | ### 优化器 GPT训练默认使用AdamW优化器以及cosine学习率衰减,这里通过配置文件配置优化器的参数,如: ```yaml Optimizer: name: AdamW weight_decay: 0.01 beta1: 0.9 beta2: 0.999 epsilon: 1.0e-8 lr: name: CosineAnnealingWithWarmupDecay decay_steps: 360000 warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 ``` 其中参数说明: | **参数名** | **参数释义** | |----------------|---------------------------| | name | 指定自定义优化器的名称 | | weight_decay | weight的衰减率 | | beta1 | 一阶矩估计的指数衰减率 | | beta2 | 二阶矩估计的指数衰减率 | | epsilon | 指定优化器需要优化的参数 | | lr.name | 指定自定义学习率策略的名称 | | decay_steps | 衰减的步长 | | warmup_rate | warmup 率 | | max_lr | Adam 的初始最大学习率 | | min_lr | Adam 的初始最小学习率 | | grad_clip.name | 指定自定义梯度裁剪策略的名称 | | clip_norm | 所允许的范数最大值 | ### 并行维度 当前GPT模型已适配自动并行的**半自动策略**,用户可以通过配置文件选择并行的维度。 ```yaml Distributed: dp_degree: 2 mp_degree: 2 pp_degree: 2 sharding: sharding_degree: 1 sharding_stage: 1 ``` 其中参数说明: | **参数名** | **参数释义** | |------------------|--------------------------------------| | dp_degree | 数据并行维度 | | mp_degree | 张量模型并行维度 | | pp_degree | 流水线并行维度 | | sharding_degree | 分组切分并行维度 | | sharding_stage | 切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 | ## 运行方式 本目录按照345M、1.3B和6.7B规模大小,给出32G V100环境下GPT模型半自动并行训练的策略配置如下: | 模型规模 | 训练策略 | yaml文件 | |----------|---------------------------- |----------------------------------------| | 345MB | 单卡+fp16 | pretrain_gpt_345M_single_card.yaml | | 1.3B | dp8+fp16+recompute | pretrain_gpt_1.3B_dp8.yaml | | 6.7B | sharding16+fp16+recompute | pretrain_gpt_6.7B_sharding16.yaml | 若要在显存容量更小的16G V100环境下进行GPT大模型训练,可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。 ### 策略支持 自动并行包括2种模式:半自动并行与全自动并行。 半自动并行包括了数据并行、张量模型并行、流水线并行和分组切片并行。此外还支持重计算、混合精度等策略,来减少显存占用、加速训练。**目前,GPT 模型训练可以支持任意维度的策略组合。** | | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute | |-----------------|---------------|-----------------|-------------------|-----------|-----------| | sharding stage1 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage2 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage3 | ✓ | ✓ | ✓ | ✓ | ✓ | ### 单卡训练 以单机1.3B模型训练为例,该gpt程序需要单卡32G V100以运行 **启动命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 export FLAGS_USE_STANDALONE_EXECUTOR=False # 设置执行器环境变量 python 
./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_single_card.yaml ``` ### 单机训练 以单机1.3B模型数据并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行。 **启动命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 log_dir=log_auto python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: **启动命令** ```shell log_dir=log_auto python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_1.3B_dp8.yaml \ -o Model.hidden_size=1024 ``` 每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到;若未指定,日志路径为`log/workerlog.x`。运行日志具体内容如下: **运行日志** ``` [INFO 2022-08-19 10:47:00,392 engine.py:461] [train] epoch: 0 step: 0 lr: 5.555556e-09 loss: 10.972320 [INFO 2022-08-19 10:47:02,858 engine.py:461] [train] epoch: 0 step: 1 lr: 8.333333e-09 loss: 10.950481 [INFO 2022-08-19 10:47:05,321 engine.py:461] [train] epoch: 0 step: 2 lr: 1.111111e-08 loss: 10.951584 [INFO 2022-08-19 10:47:07,791 engine.py:461] [train] epoch: 0 step: 3 lr: 1.388889e-08 loss: 10.954518 [INFO 2022-08-19 10:47:10,256 engine.py:461] [train] epoch: 0 step: 4 lr: 1.666667e-08 loss: 10.959060 [INFO 2022-08-19 10:47:12,725 engine.py:461] [train] epoch: 0 step: 5 lr: 1.944444e-08 loss: 10.957585 [INFO 2022-08-19 10:47:15,198 engine.py:461] [train] epoch: 0 step: 6 lr: 2.222222e-08 loss: 10.947868 [INFO 2022-08-19 10:47:17,680 engine.py:461] [train] epoch: 0 step: 7 lr: 2.500000e-08 loss: 10.939037 ``` ### 多机训练 若需要在更多机器上进行大模型训练,则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令(master节点ip为训练所用某一台机器的ip即可)。 以2机16卡32G V100上的6.7B模型分组切分并行训练为例,启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型两机训练,也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml \ -o Model.hidden_size=2048 ``` ================================================ FILE: projects/gpt/docs/hybrid_parallel.md ================================================ # GPT 混合并行模型训练 当训练超大模型时,就必须借助混合并行策略,混合并行策略分别指数据并行、张量模型并行、流水线并行和分组切片并行。其中数据并行保存完整的模型参数并独立处理一份子数据集,以加速模型训练过程;张量模型并行将网络中的张量(Tensor)切分到不同的设备,从而降低单个设备的显存消耗;流水线并行将模型的不同层放置到不同的计算设备,降低单个计算设备的显存消耗;分组切片并行将参数和模型状态划分到不同卡上,每个GPU只保存部分副本,以减少显存占用。联合四种训练方式,可以实现更大模型、更快训练的效果。具体策略以及相关FleetAPI介绍可以参考以下教程: - [数据并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/data_parallel/index_cn.html) - [张量模型并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/model_parallel_cn.html ) - [流水线并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/pipeline_parallel_cn.html) - [分组切片并行](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/06_distributed_training/group_sharded_parallel_cn.html) ## 参数释义 ### 并行维度 当前GPT模型已适配3D混合并行,并能够在训练超大模型,用户可以通过配置文件选择并行的维度。 ```yaml Distributed: dp_degree: 2 
mp_degree: 2 pp_degree: 2 sharding: sharding_degree: 1 sharding_stage: 1 sharding_offload: False reduce_overlap: False broadcast_overlap: False ``` 其中参数说明: | **参数名** | **参数释义** | |------------------|--------------------------------------| | dp_degree | 数据并行维度 | | mp_degree | 张量模型并行维度 | | pp_degree | 流水线并行维度 | | sharding_degree | 分组切分并行维度 | | sharding_stage | 切分策略;1表示仅切分优化器状态,2表示再切分梯度,3表示再切分前向参数 | | sharding_offload | CPU offload策略 | |reduce_overlap| 是否在sharding stage 2的模式下进行reduce通讯与反向计算的overlap,该策略暂时不支持sharding_offload| |broadcast_overlap| 是否在sharding stage 2的模式下进行broadcast通讯与下一个batch的 前向计算的overlap,该策略暂时不支持sharding_offload。若使用该模型,在evaluation与save之前,必须调用 `paddle.device.cuda.synchronize()` 方法| ## 运行方式 本目录中按照345M、1.3B、6.7B和175B规模大小,给出32G V100环境下GPT模型混合并行训练的策略配置如下: | 模型规模 | 训练策略 | yaml文件 | |----------|---------------------------|------------------------------| | 345M | fp16+mp8+qat | qat_gpt_345M_mp8.yaml | | 1.3B | fp16+dp8+recompute | pretrain_gpt_1.3B_dp8.yaml | | 6.7B | fp16+sharding16+recompute | pretrain_gpt_6.7B_sharding16.yaml | | 175B | fp16+mp8+pp16+recompute | pretrain_gpt_175B_mp8_pp16.yaml | 若要在显存容量更小的16G V100环境下进行GPT大模型训练,可将对应yaml文件中的`Model`-`hidden size`值改为原来的1/2即可。 ### 策略支持 飞桨的混合并行技术包括4个维度:数据并行、张量模型并行、流水线并行和分组切片并行,此外还支持重计算、offload、混合精度、序列并行等策略,来减少显存占用、加速训练。 目前,GPT模型训练已支持前3个维度的任意策略组合,但分组切片并行stage2/3仅支持与数据并行策略组合使用;详见下表。 | | data parallel | tensor parallel | pipeline parallel | pure fp16 | recompute | |-----------------|---------------|-----------------|-------------------|-----------|-----------| | sharding stage1 | ✓ | ✓ | ✓ | ✓ | ✓ | | sharding stage2 | ✓ | ㄨ | ㄨ | ✓ | ✓ | | sharding stage3 | ✓ | ㄨ | ㄨ | ✓ | ✓ | ### 单机训练 以单机1.3B模型数据并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行。 **启动命令** ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型单机训练,可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: **启动命令** ```shell log_dir=log_dp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml \ -o Model.hidden_size=1024 ``` 每张GPU的运行日志`workerlog.x`可在launch命令中指定的`log_dir`路径下找到;若未指定,日志路径为`log/workerlog.x`。运行日志具体内容如下: **运行日志** ``` [2022-09-21 05:43:58,797] [ INFO] - [train] epoch: 0, batch: 0, loss: 10.992407799, avg_batch_cost: 5.51734 sec, speed: 0.18 step/s, ips_total: 11878 tokens/s, ips: 1485 tokens/s, learning rate: 2.77778e-08 [2022-09-21 05:43:59,508] [ INFO] - [train] epoch: 0, batch: 1, loss: 11.000075340, avg_batch_cost: 0.71029 sec, speed: 1.41 step/s, ips_total: 92267 tokens/s, ips: 11533 tokens/s, learning rate: 4.16667e-08 [2022-09-21 05:44:00,242] [ INFO] - [train] epoch: 0, batch: 2, loss: 11.017463684, avg_batch_cost: 0.73301 sec, speed: 1.36 step/s, ips_total: 89406 tokens/s, ips: 11176 tokens/s, learning rate: 5.55556e-08 [2022-09-21 05:44:00,965] [ INFO] - [train] epoch: 0, batch: 3, loss: 10.983654976, avg_batch_cost: 0.72319 sec, speed: 1.38 step/s, ips_total: 90620 tokens/s, ips: 11328 tokens/s, learning rate: 6.94444e-08 [2022-09-21 05:44:01,678] [ INFO] - [train] epoch: 0, batch: 4, loss: 11.014451981, avg_batch_cost: 0.71223 sec, speed: 1.40 step/s, ips_total: 92016 tokens/s, ips: 11502 tokens/s, learning rate: 8.33333e-08 [2022-09-21 05:44:02,385] [ INFO] - [train] epoch: 0, batch: 5, loss: 11.005180359, avg_batch_cost: 0.70707 sec, speed: 1.41 
step/s, ips_total: 92687 tokens/s, ips: 11586 tokens/s, learning rate: 9.72222e-08 [2022-09-21 05:44:03,100] [ INFO] - [train] epoch: 0, batch: 6, loss: 10.989698410, avg_batch_cost: 0.71402 sec, speed: 1.40 step/s, ips_total: 91785 tokens/s, ips: 11473 tokens/s, learning rate: 1.11111e-07 [2022-09-21 05:44:03,806] [ INFO] - [train] epoch: 0, batch: 7, loss: 10.992337227, avg_batch_cost: 0.70554 sec, speed: 1.42 step/s, ips_total: 92888 tokens/s, ips: 11611 tokens/s, learning rate: 1.25000e-07 [2022-09-21 05:44:04,516] [ INFO] - [train] epoch: 0, batch: 8, loss: 10.972790718, avg_batch_cost: 0.71011 sec, speed: 1.41 step/s, ips_total: 92290 tokens/s, ips: 11536 tokens/s, learning rate: 1.38889e-07 [2022-09-21 05:44:05,228] [ INFO] - [train] epoch: 0, batch: 9, loss: 10.983499527, avg_batch_cost: 0.71128 sec, speed: 1.41 step/s, ips_total: 92138 tokens/s, ips: 11517 tokens/s, learning rate: 1.52778e-07 ``` ### 多机训练 若需要在更多机器上进行大模型训练,则需要在每个参与训练的节点上设置master节点ip/port信息后执行启动命令(master节点ip为训练所用某一台机器的ip即可)。 以2机16卡32G V100上的6.7B模型分组切分并行训练为例,启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" \ tools/train.py -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml ``` 若要在显存容量更小的16G V100环境下进行GPT模型两机训练,也可通过减小`Model.hidden_size`调整模型规模至合适大小再启动训练,命令如下: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_sharding16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=2 --devices "0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_6.7B_sharding16.yaml \ -o Model.hidden_size=2048 ``` 若要执行16机175B大模型混合并行训练,以运行启动命令为: ```shell master_ip=master节点ip master_port=可用的空闲端口号 log_dir=log_mp8_pp16 python -m paddle.distributed.launch --log_dir $log_dir \ --master=$master_ip:$master_port --nnodes=16 --devices "0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/nlp/gpt/pretrain_gpt_175B_mp8_pp16.yaml ``` 当节点较多时,可以考虑使用 `ssh` 脚本或 `mpirun` 进行跨节点命令分发。 ### 量化训练 若需要对模型进行量化训练,按照以上在配置文件中添加量化参数,可参考`qat_gpt_345M_mp8.yaml`,量化训练时可以可以适当减少训练轮数和学习率。以单机345M模型模型并行训练为例,通过``paddle.distributed.launch``启动多进程训练,该gpt程序需要8卡32G V100以运行,命令如下: ```shell log_dir=log_mp8 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/nlp/gpt/qat_gpt_345M_mp8.yaml -o Engine.max_steps=100000 \ -o Optimizer.lr.decay_steps=72000 \ -o Optimizer.lr.max_lr=5.0e-6 \ -o Optimizer.lr.min_lr=1.0e-6 ``` # GPT Zero-shot 文本生成 ## 参数释义 ```yaml Generation: top_k: 50 top_p: 0.75 temperature: 1.0 min_dec_len: 1 max_dec_len: 200 num_return_sequences: 1 decode_strategy: "sampling" ``` 其中参数说明: | **参数名** | **参数释义** | |--------------|---------------------------| | top_k | 每次为采样挑选保留分数最高的 k 个 token | | top_p | 如果设置小于 1.0 的小数,则保留加起来为 top_p 或更高的最可能的概率的 token。默认值为 1.0 | | temperature | 调节下一个 token 的概率温度,logits = logits / temperature,默认值为 1.0 | | min_dec_len | 最小生成 token 长度 | | max_dec_len | 最大生成 token 长度 | | num_return_sequences | 每个输入生成的序列个数,默认值为 1 | | decode_strategy | 解码策略,默认值为 "sampling",目前只支持 "sampling",未来会支持 "greedy_search","beam_search" | ## 文本生成 下载预训练好的模型,快速体验文本生成 ```shell cd PaddleFleetX # 如果已在 PaddleFleetX 根目录下,则忽略 mkdir -p ckpt wget -O ckpt/GPT_345M.tar.gz https://paddlefleetx.bj.bcebos.com/model/nlp/gpt/GPT_345M.tar.gz tar -xzf ckpt/GPT_345M.tar.gz -C ckpt/ # --devices 根据并行策略设置设备 python -m paddle.distributed.launch --devices "0" tasks/gpt/generation.py \ -c 
ppfleetx/configs/nlp/gpt/generation_gpt_345M_dp8.yaml \ -o Engine.save_load.ckpt_dir=./ckpt/PaddleFleetX_GPT_345M_220826/ # 生成的文本,由于 checkpoint 不同,超参不同,随机数不同,您执行可能会生成不一样的内容 Prompt: Hi, GPT2. Tell me who Jack Ma is. Generation: Hi, GPT2. Tell me who Jack Ma is. I don’t want to hear that.” For now, the only question the crowd is asking is whether or not Jack Ma will step down from the board of directors of Alibaba. Jack Ma on why he never wanted to run for President in 2016: There were two reasons. One is that I wanted to spend more time with my family. I thought it was better to spend more time with my family and spend more time with my children. So it was a very personal reason. But the second reason was that I thought it would be difficult to get elected, because there are a lot of political interests in this country. So I thought it was better to spend more time with my family. On how Alibaba will evolve into a new player in China’s transportation and logistics sector: I think that we are going to become a very important player in the logistics industry. So our strategy is to make it easy for people to travel. ``` ### 剖析体验文本生成 #### GPT 文本生成模块初始化 ```python module = build_module(cfg) module.model.eval() ``` #### 预训练模型加载 ```python # 获取到预训练 checkpoint 的根目录 ckpt_dir = cfg.Engine.save_load.ckpt_dir # 构造出具体路径 model_path = os.path.join(ckpt_dir, "model.pdparams") # 加载模型参数 model_dict = paddle.load(model_path) # FP16 模型参数转成 FP32 模型参数 for key, value in model_dict.items(): model_dict[key] = model_dict[key].astype(paddle.float32) # 设置模型参数为预训练参数 module.model.set_state_dict(model_dict) ``` #### 文本生成与结果展示 ```python input_text = "Historical Records: Tell us about the history of the Great Wall." result = module.generate(input_text) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') ``` ================================================ FILE: projects/gpt/docs/hybrid_profiler.md ================================================ # Profiler 本文档主要包括在 GPT 中开启 Profiler 并分析调试分析结果的方法,在模型开发中使用 Profiler 分析工具的方法请参考[教程](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)和[API文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/profiler/Profiler_cn.html)。 ## 参数配置 使用 Profiler 功能需要在任务配置文件中添加 Profiler 配置信息并确保字段为 `enable: True` 以开启分析器。 完整的可配置参数如下所示,可以根据使用场景调整配置。 ``` Profiler: enable: True scheduler: [1, 5] profiler_log: log_path detailed: True record_shapes: True profile_memory: True summary: overview: True device: True model: True dist: True kernel: True op: True mem: True memcpy: True ``` 其中参数说明: | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | enable | 是否开启 Profiler | False | | scheduler | 定义分析区间,如 [1, 5] 记录 step 1 到 step 4 的分析数据 | None | | profiler_log | 日志文件目录 | profiler_log | | detailed | 是否显示详细信息 | False | | record_shapes | 是否记录 tensor shape 相关信息 | True | | profile_memory | 是否统计 memory 相关信息 | True | 其中,当 detailed=True 时会打印所有 summary 表格数据,当 detailed=False 时用户可以根据以下说明定制需要展示的表格信息。 | **参数名** | **参数释义** | **默认值** | |------------------------------|------------------------|------------------------| | summary.overview | 显示每种类型的 Event 时间消耗 | True | | summary.device | 显示 CPU 和 GPU 的平均利用率信息 | False | | summary.model | 显示模型 dataloader、forward、backward、optimization 时间消耗 | True | | summary.dist | 显示计算、通信以及重叠时间 | False | | summary.kernel | 显示 GPU 执行的 kernel 信息 | True | | summary.op | 显示框架中算子 (op) 的执行信息 | True | | summary.mem | 显示内存/显存占用统计信息 | False | | summary.memcpy 
| 显示框架中调用内存操作所花费的时间 | False | ## 运行分析 本节以 gpt混合并行 为例,首先进入目录, ``` cd PaddleFleetX ``` 修改`ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml` 中 Profiler.enable 为 True, 同时可以根据上节说明调整相关配置,或者使用命令行参数覆盖,例如可以使用以下命令运行程序, ``` python -m paddle.distributed.launch \ ./tools/train.py -c \ ./ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml -o Profiler.enable=True ``` > 在使用 Profiler 工具进行性能分析时,建议减少 train 的步数,获得分析数据即可停止训练。 ## 结果分析 在训练结束后会有以下数据: * 根据配置信息在控制台打印 summary 表格 * 在配置的 `profiler_log` 目录保存 profiler json 文件 这里保存的 json 文件可以通过如下两种方式查看: * 在 chrome 浏览器中打开 chrome://tracing/,然后打开 json 文件查看 * 根据控制台信息安装并启动 `visualdl --logdir log_path` 然后根据提示在浏览器中**性能分析**模块查看 具体的信息含义解释以及分析方法请参考[文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/performance_improving/profiling_model.html)。 > 在使用 visualdl 时,如果 log 文件数据较大,启动会比较耗时,请耐心等待。 ## 附录 控制台打印的 summary 信息示例如下所示。 **Overview Summary** ``` ---------------------------------------------Overview Summary--------------------------------------------- Time unit: ms ------------------------- ------------------------- ------------------------- ------------------------- Event Type Calls CPU Time Ratio (%) ------------------------- ------------------------- ------------------------- ------------------------- ProfileStep 4 18591.04 100.00 CudaRuntime 87527 8555.11 46.02 Operator 21912 1883.11 10.13 UserDefined 13116 1841.33 9.90 OperatorInner 33668 1018.39 5.48 Forward 8 731.46 3.93 Backward 4 671.82 3.61 Optimization 4 315.91 1.70 Dataloader 4 1.37 0.01 ------------------------- ------------------------- ------------------------- ------------------------- Calls GPU Time Ratio (%) ------------------------- ------------------------- ------------------------- ------------------------- Kernel 16092 4924.90 26.49 Memcpy 4278 3617.26 19.46 Memset 780 2.31 0.01 Communication 192 2363.13 12.71 ------------------------- ------------------------- ------------------------- ------------------------- ``` **Model Summary** ``` -----------------------------------------------------Model Summary----------------------------------------------------- Time unit: ms --------------- ------ ----------------------------------------------- --------------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) --------------- ------ ----------------------------------------------- --------------------------------------------- ProfileStep 4 18591.04 / 4647.76 / 14114.47 / 757.27 / 100.00 4924.90 / 1231.22 / 2853.61 / 682.04 / 100.00 Dataloader 4 1.37 / 0.34 / 0.85 / 0.16 / 0.01 0.00 / 0.00 / 0.00 / 0.00 / 0.00 Forward 8 731.46 / 91.43 / 133.28 / 49.03 / 3.93 714.83 / 89.35 / 174.91 / 4.72 / 14.51 Backward 4 671.82 / 167.96 / 168.29 / 167.52 / 3.61 1701.53 / 425.38 / 426.97 / 424.10 / 34.55 Optimization 4 315.91 / 78.98 / 89.07 / 73.78 / 1.70 108.27 / 27.07 / 27.09 / 27.06 / 2.20 Others - 16870.48 / - / - / - / 90.75 2400.27 / - / - / - / 48.74 --------------- ------ ----------------------------------------------- --------------------------------------------- ``` **Operator Summary** ``` ----------------------------------------------------------------Operator Summary----------------------------------------------------------------- Time unit: ms ---------------------------------------------------- ------ ----------------------------------------- ---------------------------------------- Name Calls CPU Total / Avg / Max / Min / Ratio(%) GPU Total / Avg / Max / Min / Ratio(%) ---------------------------------------------------- ------ 
----------------------------------------- ---------------------------------------- -----------------------------------------------------------Thread: All threads merged------------------------------------------------------------ GradNodePyLayer_RecomputeFunction_backward 96 663.37 / 6.91 / 17.17 / 4.01 / 18.56 1629.87 / 16.98 / 17.41 / 16.69 / 26.98 TransformerDecoderLayer 96 262.68 / 2.74 / 5.91 / 1.90 / 39.60 661.18 / 6.89 / 7.11 / 6.73 / 40.57 backward 96 318.62 / 3.32 / 10.57 / 1.31 / 48.03 968.69 / 10.09 / 10.31 / 9.91 / 59.43 matmul dygraph 2312 200.13 / 0.09 / 1.61 / 0.04 / 5.60 1487.76 / 0.64 / 9.81 / 0.22 / 24.63 matmul infer_meta 964 1.42 / 0.00 / 0.01 / 0.00 / 0.71 0.00 / 0.00 / 0.00 / 0.00 / 0.00 matmul compute 964 71.38 / 0.07 / 1.59 / 0.03 / 35.67 644.02 / 0.67 / 9.81 / 0.22 / 43.29 MEMSET 192 - / - / - / - / - 0.42 / 0.00 / 0.00 / 0.00 / 0.07 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 - / - / - / - / - 199.35 / 0.52 / 0.83 / 0.22 / 30.95 volta_fp16_s884gemm_fp16_256x128_ldg8_f2f_nn 384 - / - / - / - / - 263.96 / 0.69 / 0.79 / 0.59 / 40.99 volta_h884gemm_64x128_ldg8_nn 192 - / - / - / - / - 141.13 / 0.74 / 0.92 / 0.61 / 21.91 void cutlass::Kernel 580 209.08 / 0.36 / 0.97 / 0.06 / 4.25 volta_h884gemm_64x128_ldg8_nn 288 203.89 / 0.71 / 0.92 / 0.57 / 4.14 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nn 384 199.35 / 0.52 / 0.83 / 0.22 / 4.05 volta_h884gemm_256x64_ldg8_tn 288 149.52 / 0.52 / 0.54 / 0.45 / 3.04 void phi::funcs::VectorizedBroadcastKernel 192 122.37 / 0.64 / 0.66 / 0.60 / 2.48 void cutlass::Kernel 100 103.07 / 1.03 / 8.08 / 0.73 / 2.09 void phi::funcs::VectorizedElementwiseKernelImagen

================================================ FILE: projects/imagen/README.md ================================================

# Imagen

Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding

* Paddle implementation of [Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding](https://arxiv.org/pdf/2205.11487.pdf), Google's text-to-image diffusion model that beats DALL-E 2.

## Updates

***20/September/2022:*** The code for the text-to-image and super-resolution models is released.

## Introduction

Imagen is a text-to-image diffusion model with an unprecedented degree of photorealism and a deep level of language understanding. Imagen builds on the power of large transformer language models in understanding text and hinges on the strength of diffusion models in high-fidelity image generation. Imagen utilizes a pipeline of a base 64 × 64 model and two text-conditional super-resolution diffusion models to upsample a 64 × 64 generated image into a 256 × 256 image, and then into a 1024 × 1024 image.
In comparison to previous text-to-image diffusion methods (e.g., DALL-E 2) that take advantage of multi-modal embeddings such as CLIP, Imagen benefits largely from the use of large pre-trained language models.
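The cascade described above can be sketched in a few lines. The snippet below is an illustrative outline only, not the PaddleFleetX API: `encode_text`, `base_64`, `sr_256`, and `sr_1024` are hypothetical stand-ins for the frozen text encoder and the three diffusion stages that this project's YAML configs wire together.

```python
# Illustrative sketch of Imagen's cascaded sampling; all names are hypothetical
# stand-ins, not the actual PaddleFleetX interfaces.
def generate_image(prompt, encode_text, base_64, sr_256, sr_1024):
    """Text prompt -> 64x64 sample -> 256x256 -> 1024x1024, conditioned on frozen text embeddings."""
    text_emb, text_mask = encode_text(prompt)              # frozen T5 / DeBERTa V2 embeddings
    img_64 = base_64.sample(text_emb, text_mask)           # base text-to-image diffusion model
    img_256 = sr_256.sample(img_64, text_emb, text_mask)   # first super-resolution stage
    return sr_1024.sample(img_256, text_emb, text_mask)    # second super-resolution stage
```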
    ## Usage ### Data preparing Imagen need text-image pairs for the training loop. For scaling purpose, we provide a [demo dataset](https://paddlefleetx.bj.bcebos.com/data/laion400m/part-00079) which textual embeddings and mask is precomputed. ``` cp part-00079 PaddleFleetX/projects/imagen ``` ### Imagen text encoder preparing Imagen need load pretrained text encoder model for the training loop. T5 and DeBERTa V2 are provided for Imagen. #### T5-11B ``` # T5 tokenizer and model was converted from Huggingface. config.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/config.json spiece.model: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/spiece.model tokenizer.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/t5/t5-11b/tokenizer.json t5 model: wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.1 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.2 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.3 wget https://fleetx.bj.bcebos.com/T5/t5-11b/t5.pd.tar.gz.4 cat t5.pd.tar.gz.* |tar -xf - put them into t5 folder like this: PaddleFleetX/projects/imagen/t5 ├── t5-11b ├── config.json ├── spiece.model ├── t5.pd └── tokenizer.json ``` #### DeBERTa V2 1.5B ``` # DeBERTa V2 tokenizer and model was converted from Huggingface. config.json: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/config.json spm.model: wget https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/spm.model tokenizer_config.json: https://paddlefleetx.bj.bcebos.com/tokenizers/debertav2/tokenizer_config.json denerta v2 model: wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.0 wget https://fleetx.bj.bcebos.com/DebertaV2/debertav2.pd.tar.gz.1 cat debertav2.pd.tar.gz.* | tar -xf - put them into cache folder like this: PaddleFleetX/projects/imagen/cache └── deberta-v-xxlarge ├── config.json ├── debertav2.pd ├── spm.model ├── tokenizer_config.json ``` ### Train Imagen with T5-11B text encoder ``` cd PaddleFleetX/ ``` Train Imagen text-to-image 64×64 397M diffusion model with single gpu. ``` sh projects/imagen/run_text2im_397M_64x64_single_card.sh ``` Train Imagen text-to-image 64×64 397M diffusion model with 128 gpus. ``` sh projects/imagen/run_text2im_397M_64x64_dp128.sh ``` Train Imagen text-to-image 64×64 2B diffusion model with 256 gpus. - The 2B parameters diffusion model use Group Sharded data parallelism techniques to eliminate memory redundacies by partitioning the optimizer states, gradients, and parameters across multiple devices. ``` cd PaddleFleetX/ sh projects/imagen/run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh ``` ### Train DeBERTaV2 1.5B Imagen diffusion model with 8 gpus. ``` cd PaddleFleetX/ sh projects/imagen/run_text2im_64x64_DebertaV2_dp8.sh ``` ### Train Imagen Super Resolusion 256×256 diffusion model. Train Imagen Super Resolusion 256×256 diffusion model with single gpu. ``` cd PaddleFleetX/ sh projects/imagen/run_super_resolution_256_single_card.sh ``` Train Imagen Super Resolusion 256×256 diffusion model with 128 gpus. ``` cd PaddleFleetX/ sh projects/imagen/run_super_resolution_256_dp128.sh ``` Train Imagen Super Resolusion 1024×1024 diffusion model with 128 gpus. - The 1024x1024 super resolution diffusion model use checkpointing techniques to eliminate intermediate variable memory redundacies. 
``` cd PaddleFleetX/ sh projects/imagen/run_super_resolution_1024_sharding128.sh ``` ## Citing Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding ``` @article{chen2022context, title={Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding}, author={Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J Fleet, Mohammad Norouzi}, journal={arXiv preprint arXiv:2205.11487}, year={2022} } ``` ================================================ FILE: projects/imagen/filelist/laion_400M/train ================================================ projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 
projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 projects/imagen/part-00079 ================================================ FILE: projects/imagen/run_super_resolution_1024_sharding128.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
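# Editor's note (assumption, not part of the original script): the -o overrides below switch on
# sharding stage 2 across the 8 local GPUs, disable mixed precision, reduce the per-card batch size
# to 1, and enable activation recomputation -- the "checkpointing" technique the README mentions for
# fitting the 1024x1024 super-resolution model into memory.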
log_dir=log_sharding rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml \ -o Distributed.sharding.sharding_stage=2 \ -o Distributed.sharding.sharding_degree=8 \ -o Engine.mix_precision.enable=False \ -o Data.Train.loader.batch_size=1 \ -o Model.use_recompute=True \ ================================================ FILE: projects/imagen/run_super_resolution_256_dp128.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_sharding rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml \ -o Distributed.dp_degree=128 ================================================ FILE: projects/imagen/run_super_resolution_256_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python3 tools/train.py -c ppfleetx/configs/multimodal/imagen/imagen_super_resolution_256.yaml ================================================ FILE: projects/imagen/run_text2im_2B_64x64_T5-11B_sharding8_dp32.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
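# Editor's note (assumption, not part of the original script): for the 2B-parameter model the
# overrides below combine Group Sharded parallelism (sharding stage 2, degree 8 within each group)
# with 32-way data parallelism across groups, i.e. the 256-GPU configuration described in the README.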
log_dir=log_sharding rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/train.py \ -c ./ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_T5-11B.yaml \ -o Distributed.sharding.sharding_stage=2 \ -o Distributed.dp_degree=32 \ -o Distributed.sharding.sharding_degree=8 ================================================ FILE: projects/imagen/run_text2im_397M_64x64_dp128.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. log_dir=log_dp128 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml \ -o Distributed.dp_degree=128 ================================================ FILE: projects/imagen/run_text2im_397M_64x64_single_card.sh ================================================ #! /bin/bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. python3 tools/train.py -c ppfleetx/configs/multimodal/imagen/imagen_397M_text2im_64x64.yaml ================================================ FILE: projects/imagen/run_text2im_64x64_DebertaV2_dp8.sh ================================================ #! /bin/bash # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
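# Launches single-node training of the Imagen 64x64 text-to-image model with a DebertaV2
# text encoder, using plain data parallelism across the 8 local GPUs (dp_degree=8).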
log_dir=log_dp8 rm -rf $log_dir python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ tools/train.py \ -c ppfleetx/configs/multimodal/imagen/imagen_text2im_64x64_DebertaV2.yaml \ -o Distributed.dp_degree=8 ================================================ FILE: projects/moco/README.md ================================================ # MoCo ![MoCo](https://user-images.githubusercontent.com/11435359/71603927-0ca98d00-2b14-11ea-9fd8-10d984a2de45.png) This is a PaddlePaddle implementation of [MoCo v1](https://arxiv.org/abs/1911.05722) and [MoCo v2](https://arxiv.org/abs/2003.04297). ## Installation MoCo requires `PaddlePaddle >= 2.4`. ```shell # git clone https://github.com/PaddlePaddle/PaddleFleetX.git cd /path/to/PaddleFleetX ``` All commands are executed in the `PaddleFleetX` root directory. ```shell python -m pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple ``` ## Data Preparation The ImageNet-1k dataset needs to be prepared first and organized into the following directory structure. ```shell ILSVRC2012 ├── train/ ├── xxx ├── val/ └── xxx ``` Then link the dataset into the expected path. ```shell mkdir -p dataset ln -s /path/to/ILSVRC2012 dataset/ILSVRC2012 ``` ## Unsupervised Training To run unsupervised pre-training of a ResNet-50 model on ImageNet on a single machine with 8 GPUs, use one of the following scripts: ### MoCo V1 (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml ``` ### MoCo V2 (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml ``` MoCo v2 differs from MoCo v1 in the following ways: * MoCo v2 adds an MLP projection head * stronger data augmentation * a different softmax temperature * a cosine learning rate schedule ## Linear Classification Once unsupervised pre-training has finished (or after downloading one of the provided pre-trained checkpoints), you can use the following scripts to train a supervised linear classifier on top of the frozen encoder.
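Conceptually, linear classification ("linear probing") keeps the pre-trained encoder frozen and trains only a newly added classification head on its features. The snippet below is a minimal sketch of that idea in plain Paddle, not the PaddleFleetX implementation: the yaml configs and the `pretrained` override in the commands below take care of loading the MoCo encoder, and the ResNet-50 backbone and learning-rate value here are illustrative assumptions.

```python
import paddle
from paddle.vision.models import resnet50

# Illustrative backbone; in PaddleFleetX the MoCo-pre-trained encoder is loaded via
# `-o Model.model.base_encoder.pretrained=...` as shown in the commands below.
model = resnet50(num_classes=1000)

# Freeze everything except the final fully connected layer (the linear classifier).
for name, param in model.named_parameters():
    param.stop_gradient = not name.startswith("fc")

# Only the classifier head is updated during linear classification training.
head_params = [p for n, p in model.named_parameters() if n.startswith("fc")]
optimizer = paddle.optimizer.Momentum(
    learning_rate=30.0,  # a large LR is common for linear probing; value is illustrative
    momentum=0.9,
    parameters=head_params)
```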
### MoCo v1 #### [Optional] Download checkpoint ```shell mkdir -p pretrained/moco/ wget -O ./pretrained/moco/mocov1_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.pdparams ``` #### Linear Classification Training (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov1_pt_imagenet2012_resnet50 ``` ### MoCo v2 #### [Optional] Download checkpoint ```shell mkdir -p pretrained/moco/ wget -O ./pretrained/moco/mocov2_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams ``` #### Linear Classification Training (Single Node with 8 GPUs) ```shell export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov2_pt_imagenet2012_resnet50 ``` ## Models | Model | Phase | Epochs | Top1 Acc | Checkpoint | Log | | ------- | --------------------- | ------ | -------- | ------------------------------------------------------------ | ------------------------------------------------------------ | | MoCo v1 | Unsupervised Training | 200 | - | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_pt_imagenet2012_resnet50.log) | | MoCo v1 | Linear Classification | 100 | 0.606141 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov1_lincls_imagenet2012_resnet50.log) | | MoCo v2 | Unsupervised Training | 200 | - | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.log) | | MoCo v2 | Linear Classification | 100 | 0.676595 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.log) | ## Citations ``` @Article{he2019moco, author = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick}, title = {Momentum Contrast for Unsupervised Visual Representation Learning}, journal = {arXiv preprint arXiv:1911.05722}, year = {2019}, } @Article{chen2020mocov2, author = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He}, title = {Improved Baselines with Momentum Contrastive Learning}, journal = {arXiv preprint arXiv:2003.04297}, year = {2020}, } ``` ================================================ FILE: projects/moco/run_mocov1_lincls_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov1_pt_imagenet2012_resnet50 ================================================ FILE: projects/moco/run_mocov1_pretrain_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml ================================================ FILE: projects/moco/run_mocov2_lincls_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml \ -o Model.model.base_encoder.pretrained=./pretrained/moco/mocov2_pt_imagenet2012_resnet50 ================================================ FILE: projects/moco/run_mocov2_pretrain_in1k.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml ================================================ FILE: projects/protein_folding/README.md ================================================ # Protein Folding 声明: 本项目不提供具体能运行的蛋白质结构预测程序,如果想体验直接能运行的蛋白质结构预测代码,请跳转到 [HelixFold](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold) 中运行。 本项目是一个教程,展示如何将数据并行、动态轴并行、分支并行(DP-DAP-BP)混合并行接入到 HelixFold 中。 想要在 HelixFold 中使用混合并行,则涉及到以下几个方面: * 依赖安装 * 通信初始化 * 混合并行网络模型使用 * 优化器设置 DAP 和 BP 属性 * 参数同步与梯度同步 ## 依赖安装 ```shell pip install ppfleetx ``` ## 通信初始化 ```python from ppfleetx.distributed.protein_folding import dp from ppfleetx.distributed.protein_folding.scg import scg def init_distributed_env(args): dp_rank = 0 # ID for current device in distributed data parallel collective communication group dp_nranks = 1 # The number of devices in distributed data parallel collective communication group if args.distributed: # init bp, dap, dp hybrid distributed environment scg.init_process_group(parallel_degree=[('dp', None), ('dap', args.dap_degree), ('bp', args.bp_degree)]) dp_nranks = dp.get_world_size() dp_rank = dp.get_rank_in_group() if dp_nranks > 1 else 0 if args.bp_degree > 1 or args.dap_degree > 1: assert args.seed is not None, "BP and DAP should be set seed!" 
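    # A fixed seed is required because DAP and BP replicate rather than shard the parameters,
    # so every rank must initialize them identically (they are synchronized again below).
    # dp_rank / dp_nranks are what the caller would use to shard training data across the
    # data-parallel group (assumption based on the comments above).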
return dp_rank, dp_nranks ``` ## 混合并行网络模型使用 目前,在 HelixFold 网络模型中涉及到混合并行的有 Embedding 和 Evoformer 类,因此可以将原来 HelixFold 中的 `EmbeddingsAndEvoformer` 修改为 `DistEmbeddingsAndEvoformer`。在网络模型中涉及 `DAP` 和 `BP` 的网络模型修改都在 [DistEmbeddingsAndEvoformer](../../ppfleetx/models/protein_folding/evoformer.py) 中封装, ```python from ppfleetx.models.protein_folding.evoformer import DistEmbeddingsAndEvoformer evoformer = DistEmbeddingsAndEvoformer( self.channel_num, self.config.embeddings_and_evoformer, self.global_config) ``` ## 优化器设置 DAP 和 BP 属性 由于 `DAP` 和 `BP` 在网络模型中分别切分的是中间激活值和网络计算分支,参数是没有切分的,因此在梯度同步的时候, 是需要区分同步的。我们将 `dap` 和 `bp` 属性设置在优化器参数分组中作为区分,并在后续梯度同步的时候使用。 ```python evoformer_params = [] template_and_pair_transition_params = [] other_params = [] for name, p in model.named_parameters(): if 'template_pair_stack' in name or 'pair_transition' in name: template_and_pair_transition_params.append(p) elif 'evoformer_iteration' in name or 'extra_msa_stack' in name: evoformer_params.append(p) else: other_params.append(p) parameters = [] if args.dap_degree > 1 or args.bp_degree > 1: parameters.append({'params': get_fused_params(other_params)}) parameters.append({'params': get_fused_params(evoformer_params), 'dap': True, 'bp': True}) parameters.append({'params': get_fused_params(template_and_pair_transition_params), 'dap': True}) else: parameters.append({'params': get_fused_params(other_params + evoformer_params + template_and_pair_transition_params)}) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, epsilon=1e-06, grad_clip=grad_clip, parameters = parameters ) ``` ## 参数同步与梯度同步 ### 参数同步 虽然是 `DP-DAP-BP` 混合并行,但是每个设备上的模型参数是没有切分的,因为在模型训练之前也需要做一次参数同步。 ```python from ppfleetx.distributed.protein_folding import dp model = RunModel(train_config, model_config) dp.param_sync(model, src_rank=0) ``` ### 梯度同步 如上节所述,在梯度同步的时候需要分别对 `DP`,`DAP`,`BP` 并行策略相关的模型参数的梯度进行同步。 ```python from ppfleetx.distributed.protein_folding import dap, bp, dp loss.backward() # sync the gradient for branch parallel firstly bp.grad_sync(optimizer._param_groups) # then sync the gradient for dap dap.grad_sync(optimizer._param_groups) # finally sync the gradient for ddp dp.grad_sync(optimizer._param_groups) optimizer.step() optimizer.clear_grad() ``` ## 论文引用 ``` @article{wang2022helixfold, title={HelixFold: An Efficient Implementation of AlphaFold2 using PaddlePaddle}, author={Wang, Guoxia and Fang, Xiaomin and Wu, Zhihua and Liu, Yiqun and Xue, Yang and Xiang, Yingfei and Yu, Dianhai and Wang, Fan and Ma, Yanjun}, journal={arXiv preprint arXiv:2207.05477}, year={2022} } @article{wang2022efficient_alphafold2, title={Efficient AlphaFold2 Training using Parallel Evoformer and Branch Parallelism}, author={Wang, Guoxia and Wu, Zhihua and Fang, Xiaomin and Xiang, Yingfei and Liu, Yiqun and Yu, Dianhai and Ma, Yanjun}, journal={arXiv preprint arXiv:2211.00235}, year={2022} } ``` ================================================ FILE: projects/ufo2.0/README.md ================================================ # VIMER-UFO 2.0 (文心-CV大模型) ## 整体概述 近年来预训练大模型一次次刷新记录,展现出惊人的效果,但对于产业界而言,势必要面对如何应用落地的问题。当前预训练模型的落地流程可被归纳为:针对只有少量标注数据的特定任务,使用任务数据 fine-tune 预训练模型并部署上线。然而,当预训练模型参数量不断增大后,该流程面临两个严峻的挑战。首先,随着模型参数量的急剧增加,大模型 fine-tuning 所需要的计算资源将变得非常巨大,普通开发者通常无法负担。其次,随着 AIoT 的发展,越来越多 AI 应用从云端往边缘设备、端设备迁移,而大模型却无法直接部署在这些存储和算力都极其有限的硬件上。 针对预训练大模型落地所面临的问题,百度提出统一特征表示优化技术(UFO:Unified Feature Optimization),在充分利用大数据和大模型的同时,兼顾大模型落地成本及部署效率。VIMER-UFO 2.0 技术方案的主要内容包括: * Task MoE: 飞桨多任务超网络分布式训练架构,支持训练任务动态扩展,特定任务任意切分,保证多任务之间信息有效借鉴,负载均衡,高效协同。 * All in One:行业最大 170 
亿参数视觉多任务模型,覆盖人脸、人体、车辆、商品、食物细粒度分类等 20+ CV 基础任务,单模型 28 个公开测试集效果 SOTA。 * One for All:首创针对视觉多任务的超网络与训练方案,支持各类任务、各类硬件的灵活部署,解决大模型参数量大,推理性能差的问题。 ![图1:UFO整体架构](./img/UFO_v2_1.png) ## 模型效果 文心VIMER-UFO 2.0大模型是基于飞桨的Task MoE架构构建多任务超网络,模型参数量达到170亿,单模型28项公开数据集SOTA。基于飞桨Task MoE架构,可以根据任务的不同自动选择激活最优的区域,从而实现100倍参数压缩,同时支持下游任务快速扩展,是行业最大的视觉多任务统一大模型。尽管 VIMER-UFO 2.0 大模型参数量达到了170 亿,得益于 Task-MoE 稀疏结构,每个任务推理时只需激活部分参数,计算量相当于 6 亿参数模型规模,加速比接近 30 倍。更多细节请参看[VIMER-UFO 2.0](https://github.com/PaddlePaddle/VIMER/tree/develop/UFO)。 ![图2:UFO_Result](./img/UFO_v2_2.png) ## 飞桨Task MoE分布式训练架构 如此大的参数规模和任务数,给模型的训练带来了巨大的挑战。文心VIMER-UFO 2.0大模型采用稀疏门控混合专家设计,仅参数存储就需要68G,给训练时的模型存储带来了压力;该模型在前向反向时所有计算节点间会进行同步等待的All-to-All通信,使得通信负担明显加大;此外,该模型的多任务数目是动态的,且多个任务之间样本严重不均衡,使得计算节点之间的同步等待较长,影响并发效率。 针对这些挑战,飞桨提出了Task MoE分布式训练架构,不仅实现多级并行存储稀疏参数,还支持硬件拓扑感知通信,使得层次化All-to-All通信效率提升20%。同时飞桨还创新性地提出了基于Task的负载均衡机制,支持任务数量的动态扩展、特定任务的任意切分以及多个任务在不同的专家下的并发训练,同等实验环境下训练性能比PyTorch提升66%。同时,该方案保障多任务之间信息借鉴机制的有效性,使得VIMER-UFO 2.0模型精度大幅提升。此外,在推理阶段,基于飞桨Task MoE架构构建的多任务多路径的超网络,可支持任务粒度的路径选择,方便灵活部署。 ![图3:UFO_Perf](./img/UFO_Perf.png) ## 使用方案 1. 有关UFO的更多细节原理请参看[VIMER-UFO 2.0](https://github.com/PaddlePaddle/VIMER/tree/develop/UFO)。 2. VIMER-UFO 2.0 相关的模型、训练代码和评测脚本均已开源,更多细节正在逐渐完善中,了解详细信息可访问:https://github.com/PaddlePaddle/VIMER/tree/main/UFO/OneForAll ================================================ FILE: projects/vit/README.md ================================================ # Vision Transformer This project implements the (Vision Transformer) proposed by google [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929). ## How to pretrain from scratch on imagenet2012 ### Go to the main repo directory All commands are executed in the home directory. ``` cd /path/to/PaddleFleetX ``` ### Data The imagenet 1k dataset needs to be prepared first and will be organized into the following directory structure. ``` ILSVRC2012 ├── train/ ├── train_list.txt ├── val/ └── val_list.txt ``` Then configure the path. ```shell mkdir -p dataset ln -s /path/to/ILSVRC2012 dataset/ILSVRC2012 ``` ### Train ViT-B/16 Note: ViT-B/16 needs run on 2 nodes with 16 A100 GPUs. If you only have a low-memory GPU, you can use gradient accumulation by setting `accumulate_steps` in yaml. The following commands need to be run on each node. ```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml ``` ## Finetune ViT-B/16 ### [Optional] Download checkpoint ```shell mkdir -p pretrained/vit/ wget -O ./pretrained/vit/imagenet2012-ViT-B_16-224.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams ``` ### Finetune on imagenet2012 Finetune is similar to pre-training on ImageNet2012 dataset, we have provided the configured yaml file. ```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ``` ### Finetune on cifar10 Note: CIFAR10 dataset is automatically downloaded and cached. 
```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_cifar10_1n8c_dp_fp16o2.yaml ``` ### Quantization Aware Training on ImageNet2012 ```shell python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.drop_rate=0.0 \ -o Data.Train.sampler.batch_size=16 \ -o Optimizer.lr.learning_rate=5e-05 \ -o Optimizer.weight_decay=0.0002 ``` 量化训练的参数详细介绍见[模型压缩介绍](../../../docs/compression.md)。 ## Model | Model | Phase | Size | Dataset | Resolution | GPUs | Img/sec | Top1 Acc | Pre-trained checkpoint | Fine-tuned checkpoint | Log | |----------|----------|--------|--------------|------------|-------------|---------|----------|----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------| | ViT-B_16 | pretrain | 167MiB | ImageNet2012 | 224 | A100*N2C16 | 7350 | 74.75% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | - | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.log) | | ViT-B_16 | finetune | 167MiB | ImageNet2012 | 384 | A100*N2C16 | 1580 | 77.68% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.log) | | ViT-L_16 | finetune | 582MiB | ImageNet2012 | 384 | A100*N2C16 | 519 | 85.13% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k-jax-ViT-L_16-224.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k+imagenet2012-ViT-L_16-384.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet21k+imagenet2012-ViT-L_16-384.log) | | Quantized ViT-B_16 | finetune | 167MiB | ImageNet2012 | 384 | A100*N2C16 | 1580 | 77.71% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-384.pdparams) | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/quantized_imagenet2012-ViT-B_16-384.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/quantized_imagenet2012-ViT-B_16-384.log) | # 推理部署 参考[这里](./docs/inference.md) ================================================ FILE: projects/vit/auto_vit_patch16_224_dp8.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
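# Trains ViT-tiny (patch16, 224) through the auto-parallel entry point (tools/auto.py)
# on the CIFAR10 CI config, using 8-way data parallelism and FP16 O2 on a single node
# with 8 GPUs (as encoded in the yaml file name).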
log_dir=log_auto rm -rf $log_dir # tiny_patch16_224+dp8 run_pretrain python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \ ./tools/auto.py \ -c ppfleetx/configs/vis/vit/auto/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml ================================================ FILE: projects/vit/docs/inference.md ================================================ # Inference Deployment After training, the model can be deployed for inference with Paddle Inference, PaddlePaddle's high-performance inference engine, as follows. ```bash sh projects/vit/run_inference_base_patch16_224.sh ``` The individual steps are described below. ## 1. Model Export First, export the trained model as an inference model for deployment. This is done with `tools/export.py`: `-c` specifies the config file of the model to export, and `-o Engine.save_load.ckpt_dir=` specifies the weights used for the export. Taking the `ViT-224` model as an example, download the trained weights released by PaddleFleetX as follows. If you have already downloaded them, or want to use weights from your own training run, you can skip this step. ```bash mkdir -p ckpt wget -O ckpt/model.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams ``` Export the inference model as follows. ```bash python tools/export.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml \ -o Engine.save_load.ckpt_dir=./ckpt/ ``` The exported model is saved to the `./output` directory by default; this can be changed via `Engine.save_load.output_dir` in the config file or with `-o Engine.save_load.output_dir=`. ## 2. Inference Deployment Once the model has been exported, inference can be run with the `projects/vit/inference.py` script. ```bash python projects/vit/inference.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml ``` ================================================ FILE: projects/vit/export_qat.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/export.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.drop_rate=0.0 \ -o Data.Train.sampler.batch_size=16 \ -o Optimizer.lr.learning_rate=5e-05 \ -o Optimizer.weight_decay=0.0002 ================================================ FILE: projects/vit/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
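# ViT image-classification demo built on Paddle Inference (via EagerEngine in 'inference'
# mode). It preprocesses projects/vit/images/demo.jpg (resize to 224x224, normalize with
# mean/std 0.5, NCHW layout) and, if no TensorRT dynamic-shape file (shape.pbtxt) exists
# yet, first runs one pass with Inference.TensorRT.collect_shape=True to gather shapes
# before running the actual inference pass. The softmax over the output tensor
# ('linear_99.tmp_1') yields the predicted class index and its probability.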
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import numpy as np from PIL import Image import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) from ppfleetx.utils import config from ppfleetx.distributed.apis import env from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader, tokenizers from ppfleetx.models import build_module from ppfleetx.core import EagerEngine def softmax(x): exp_x = np.exp(x) return exp_x/np.sum(exp_x) def preprocess(img_path): """preprocess Preprocess to the input. Args: img_path: Image path. Returns: Input data after preprocess. """ with open(img_path, "rb") as f: img = Image.open(f) img = img.convert("RGB") # ResizeImage img = img.resize((224,224), Image.BILINEAR) # NormalizeImage scale = np.float32(1.0/255.0) mean = [0.5, 0.5, 0.5] std = [0.5, 0.5, 0.5] shape = (1, 1, 3) mean = np.array(mean).reshape(shape).astype('float32') std = np.array(std).reshape(shape).astype('float32') img = (img * scale - mean) / std # ToNCHW img = img.transpose((2, 0, 1)) img = np.expand_dims(img, axis=0) return img if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) env.set_seed(cfg.Global.seed) np.random.seed(1) img_path = 'projects/vit/images/demo.jpg' img = preprocess(img_path) if(os.path.exists('shape.pbtxt')==False): cfg.Inference.TensorRT.collect_shape = True module = build_module(cfg) engine = EagerEngine(configs=cfg,module=module, mode='inference') outs = engine.inference([img]) cfg.Inference.TensorRT.collect_shape = False module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg,module=module, mode='inference') outs = engine.inference([img]) res = softmax(outs['linear_99.tmp_1']) max_index = np.argmax(res, axis=-1) print("类型: ", max_index[0],) print("概率: ", res[0][max_index[0]]) ================================================ FILE: projects/vit/run_finetune.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml #python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml ================================================ FILE: projects/vit/run_finetune_fused_attention.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.use_fused_attn=True ================================================ FILE: projects/vit/run_inference_base_patch16_224.sh ================================================ echo "step 1: download parameters" mkdir -p ckpt wget -O ckpt/model.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams echo "step 2: export model" python tools/export.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml \ -o Engine.save_load.ckpt_dir=./ckpt/ echo "step 3: run VIT inference" python projects/vit/inference.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml ================================================ FILE: projects/vit/run_pretrain.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml ================================================ FILE: projects/vit/run_pretrained_fused_attention.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.use_fused_attn=True ================================================ FILE: projects/vit/run_qat.sh ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py \ -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml \ -o Model.model.drop_rate=0.0 \ -o Data.Train.sampler.batch_size=16 \ -o Optimizer.lr.learning_rate=5e-05 \ -o Optimizer.weight_decay=0.0002 ================================================ FILE: requirements.txt ================================================ paddleslim @ https://paddle-qa.bj.bcebos.com/PaddleSlim/paddleslim-0.0.0.dev0-py3-none-any.whl paddlenlp @ https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-ci-py3-none-any.whl requests==2.25.1 regex==2022.7.25 colorlog==6.6.0 colorama==0.4.5 omegaconf==2.2.2 tqdm>=4.62.1 pybind11==2.10.0 numpy>=1.19.5,<=1.21.6 opencv-python>=4.2.0.32 Pillow==9.3.0 blobfile==1.3.3 ================================================ FILE: setup.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from setuptools import setup, Extension, find_packages from ppfleetx.data.data_tools.cpp.compile import compile_helper compile_helper() def fetch_requirements(path): with open(path, 'r') as fd: return [r.strip() for r in fd.readlines()] install_requires = fetch_requirements('requirements.txt') setup( name='ppfleetx', version='0.0.0', description='PaddleFleetX', author='PaddlePaddle Authors', url='https://github.com/PaddlePaddle/PaddleFleetX', install_requires=install_requires, package_data={ 'ppfleetx.data.data_tools.cpp': ['fast_index_map_helpers.so'] }, packages=find_packages()) ================================================ FILE: tasks/gpt/generation.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
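# Text-generation entry point for GPT: builds the model from the yaml config, optionally
# loads Engine.save_load.ckpt_dir/model.pdparams (casting the weights to float32), and
# prints the continuation of a fixed demo prompt produced by module.generate().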
import argparse import math import os import random import time import sys import yaml import numpy as np import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) from ppfleetx.utils import config from ppfleetx.models import build_module from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) module.model.eval() ckpt_dir = cfg.Engine.save_load.ckpt_dir if ckpt_dir is not None: model_path = os.path.join(ckpt_dir, "model.pdparams") model_dict = paddle.load(model_path) for key, value in model_dict.items(): model_dict[key] = model_dict[key].astype(paddle.float32) module.model.set_state_dict(model_dict) input_text = 'Hi, GPT2. Tell me who Jack Ma is.' result = module.generate(input_text) print(f'Prompt: {input_text}') print(f'Generation: {result[0]}') ================================================ FILE: tasks/gpt/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader, tokenizers from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) tokenizer = tokenizers.GPTTokenizer.from_pretrained("gpt2") engine = EagerEngine(configs=cfg, module=module, mode='inference') input_text = 'Hi, GPT2. Tell me who Jack Ma is.' input_ids = [tokenizer.encode(input_text)] outs = engine.inference([input_ids]) ids = list(outs.values())[0] out_ids = [int(x) for x in ids[0]] result = tokenizer.decode(out_ids) result = input_text + result print('Prompt:', input_text) print('Generation:', result) ================================================ FILE: tasks/gpt/run_generation.sh ================================================ #!/usr/bin/env bash # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # for single card generation export CUDA_VISIBLE_DEVICES=0 python tasks/gpt/generation.py -c ./ppfleetx/configs/nlp/gpt/generation_gpt_345M_single_card.yaml ================================================ FILE: tools/auto.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import random import paddle import numpy as np import paddle.distributed as dist from paddle.distributed import fleet __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.models import build_module from ppfleetx.data import build_auto_dataset from ppfleetx.core import AutoEngine #init_logger() if __name__ == "__main__": args = config.parse_args() cfg = config.get_auto_config( args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: fleet.init(is_collective=True) module = build_module(cfg) config.print_config(cfg) train_data = build_auto_dataset(cfg.Data, "Train") eval_data = build_auto_dataset(cfg.Data, "Eval") cfg.Optimizer.lr.update({ 'epochs': cfg.Engine.num_train_epochs, 'step_each_epoch': len(train_data) }) engine = AutoEngine(configs=cfg, module=module) if cfg.Engine.save_load.ckpt_dir is not None: engine.load() if cfg.get('Tuning', None) and cfg.Tuning.enable: engine.tune(train_data) else: engine.fit(train_dataset=train_data, valid_dataset=eval_data, epoch=cfg.Engine.num_train_epochs) ================================================ FILE: tools/auto_export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
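# Export entry point for models trained with auto parallel: if the config contains a
# Model section, the module is rebuilt and exported with AutoEngine.export(); otherwise
# the inference program is exported directly from the saved checkpoint via
# export_from_prog(), in which case Engine.save_load.ckpt_dir must be set.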
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import random import paddle import numpy as np __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.models import build_module from ppfleetx.core import AutoEngine if __name__ == "__main__": args = config.parse_args() cfg = config.get_auto_config( args.config, overrides=args.override, show=False) if cfg.get('Model', None) is not None: module = build_module(cfg) config.print_config(cfg) engine = AutoEngine(configs=cfg, module=module, mode="export") if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.export() else: engine = AutoEngine(configs=cfg, mode="export") if cfg.Engine.save_load.ckpt_dir is None: raise ValueError("invalid ckpt_dir.") engine.export_from_prog() ================================================ FILE: tools/eval.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.data import build_dataloader from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg, module=module, mode='eval') valid_data_loader = build_dataloader(cfg.Data, "Eval") if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.evaluate( valid_data_loader=valid_data_loader, epoch=cfg.Engine.num_train_epochs) ================================================ FILE: tools/export.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
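# Export entry point for dygraph-trained models: builds the module from the yaml config,
# optionally restores weights from Engine.save_load.ckpt_dir, and writes an inference
# model via EagerEngine in 'export' mode (as used in projects/vit/docs/inference.md).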
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg, module=module, mode='export') if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.export() ================================================ FILE: tools/inference.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env # init_logger() if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) engine = EagerEngine(configs=cfg, module=module, mode='inference') test_data_loader = build_dataloader(cfg.Data, "Test") for iter_id, data in enumerate(test_data_loader()): outs = engine.inference(data) if iter_id >= cfg.Engine.test_iters: break logger.info("The inference process is complete.") del test_data_loader ================================================ FILE: tools/train.py ================================================ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
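# Main dygraph training entry point: selects the device from Global.device, initializes
# the distributed environment when launched with more than one process, builds the module
# and the Train/Eval dataloaders, fills in the LR scheduler config (epochs, steps per
# epoch, total steps), optionally resumes from Engine.save_load.ckpt_dir, and runs
# engine.fit().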
from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import sys import copy import paddle from paddle.distributed import fleet import paddle.distributed as dist __dir__ = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) from ppfleetx.utils import config from ppfleetx.utils.log import logger from ppfleetx.data import build_dataloader from ppfleetx.models import build_module from ppfleetx.core import EagerEngine from ppfleetx.distributed.apis import env def set_default_flags(flags): for flag_name, flag_value in flags.items(): if os.getenv(flag_name) is None: paddle.set_flags({flag_name: flag_value}) if __name__ == "__main__": args = config.parse_args() cfg = config.get_config(args.config, overrides=args.override, show=False) paddle.set_device(cfg["Global"]["device"]) if dist.get_world_size() > 1: env.init_dist_env(cfg) env.set_seed(cfg.Global.seed) module = build_module(cfg) config.print_config(cfg) train_data_loader = build_dataloader(cfg.Data, "Train") eval_data_loader = build_dataloader(cfg.Data, "Eval") cfg.Optimizer.lr.update({ 'epochs': cfg.Engine.num_train_epochs, 'step_each_epoch': len(train_data_loader), 'total_steps': cfg.Engine.max_steps, }) engine = EagerEngine(configs=cfg, module=module) if cfg.Engine.save_load.ckpt_dir is not None: engine.load() engine.fit(train_data_loader=train_data_loader, valid_data_loader=eval_data_loader, epoch=cfg.Engine.num_train_epochs)